agent-relay-runner 0.16.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-runner",
3
- "version": "0.16.0",
3
+ "version": "0.17.0",
4
4
  "description": "Unified provider lifecycle runner for Agent Relay",
5
5
  "type": "module",
6
6
  "bin": {
@@ -20,7 +20,7 @@
20
20
  "directory": "runner"
21
21
  },
22
22
  "dependencies": {
23
- "agent-relay-sdk": "0.2.7"
23
+ "agent-relay-sdk": "0.2.8"
24
24
  },
25
25
  "devDependencies": {
26
26
  "@types/bun": "latest",
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "agent-relay-runner",
3
3
  "description": "Thin Agent Relay runner bridge for Claude Code",
4
- "version": "0.16.0",
4
+ "version": "0.17.0",
5
5
  "agentRelayContracts": {
6
6
  "providerPluginProtocol": 1
7
7
  }
package/src/outbox.ts ADDED
@@ -0,0 +1,303 @@
1
+ import { Database } from "bun:sqlite";
2
+ import { mkdirSync } from "node:fs";
3
+ import { dirname, join } from "node:path";
4
+ import { tmpdir } from "node:os";
5
+ import { logger } from "./logger";
6
+
7
+ // Phase 2 (#196) — the "nothing is ever lost" half. Runner→server events that used to be
8
+ // fire-and-forget over HTTP (session turns, reasoning/tool traces, prompt echoes, insights,
9
+ // hook-fatal reports) were silently dropped whenever the server was momentarily down. This
10
+ // is a durable, FIFO, disk-backed queue that:
11
+ // - survives Runner/server restart (bun:sqlite file in the runtime dir),
12
+ // - stamps true event time (`occurredAt`) once at enqueue and preserves it through retries,
13
+ // - retries with capped exponential backoff, strictly in order (an append log must not
14
+ // reorder turns),
15
+ // - poisons a permanently-failing head after maxAttempts so it can't block the queue,
16
+ // - is bounded with a logged drop policy (never silently truncates).
17
+ //
18
+ // Status deliberately does NOT go through here: it rides the WebSocket bus, which is
19
+ // last-wins and self-heals on reconnect (so it already satisfies "coalesce, don't replay
20
+ // stale busy states"). The coalesce mode below exists so a future state event could migrate here.
21
+
22
+ export type OutboxMode = "append" | "coalesce";
23
+
24
+ export interface OutboxEventInput {
25
+ kind: string;
26
+ payload: unknown;
27
+ mode?: OutboxMode;
28
+ // Required for coalesce mode: prior un-poisoned rows with the same dedupeKey are replaced.
29
+ dedupeKey?: string;
30
+ // Defaults to now. Set explicitly only to backdate (e.g. replaying a captured timestamp).
31
+ occurredAt?: number;
32
+ // Defaults to a stable derived key so server-side dedup makes retries exactly-once.
33
+ idempotencyKey?: string;
34
+ }
35
+
36
+ export interface OutboxRecord {
37
+ seq: number;
38
+ kind: string;
39
+ mode: OutboxMode;
40
+ occurredAt: number;
41
+ idempotencyKey: string;
42
+ payload: unknown;
43
+ attempts: number;
44
+ }
45
+
46
+ // The transport. Resolve = delivered (row deleted). Reject = failed (retried with backoff).
47
+ export type OutboxSend = (record: OutboxRecord) => Promise<void>;
48
+
49
+ export interface OutboxOptions {
50
+ agentId: string;
51
+ send: OutboxSend;
52
+ // Storage directory. Defaults to AGENT_RELAY_RUNNER_OUTBOX_DIR, else a per-host temp dir.
53
+ dir?: string;
54
+ maxRows?: number;
55
+ maxAttempts?: number;
56
+ baseBackoffMs?: number;
57
+ maxBackoffMs?: number;
58
+ pollMs?: number;
59
+ }
60
+
61
+ const DEFAULTS = {
62
+ maxRows: 5000,
63
+ maxAttempts: 12,
64
+ baseBackoffMs: 1_000,
65
+ maxBackoffMs: 60_000,
66
+ pollMs: 5_000,
67
+ };
68
+
69
+ interface Row {
70
+ seq: number;
71
+ kind: string;
72
+ mode: string;
73
+ occurred_at: number;
74
+ idempotency_key: string;
75
+ payload: string;
76
+ attempts: number;
77
+ next_attempt_at: number;
78
+ poisoned: number;
79
+ }
80
+
81
/**
 * Durable, FIFO, SQLite-backed queue for runner→server events.
 *
 * Events are persisted by `enqueue()` and delivered strictly oldest-first by a
 * background pump (`start()` / `drain()`) through the caller-supplied `send`
 * transport. A failed send is retried with capped, jittered exponential backoff;
 * after `maxAttempts` failures the row is marked poisoned so it no longer blocks
 * the rows behind it. The table is bounded by `maxRows` with a logged drop policy.
 */
export class Outbox {
  private readonly db: Database;
  private readonly agentId: string;
  private readonly send: OutboxSend;
  private readonly maxRows: number;
  private readonly maxAttempts: number;
  private readonly baseBackoffMs: number;
  private readonly maxBackoffMs: number;
  private readonly pollMs: number;
  /** Database file path, or ":memory:" when `options.dir` is ":memory:". */
  readonly path: string;

  // True while a drain loop is running; concurrent drain() calls set `rerun` instead.
  private draining = false;
  private rerun = false;
  private pollTimer?: ReturnType<typeof setInterval>;
  private dueTimer?: ReturnType<typeof setTimeout>;
  private stopped = false;

  /**
   * Opens (creating if needed) the per-agent database and its `outbox` table.
   *
   * @param options Agent id, transport, and optional storage dir / tuning knobs.
   *   The directory resolves as `options.dir`, then AGENT_RELAY_RUNNER_OUTBOX_DIR,
   *   then `<tmpdir>/agent-relay-outbox`.
   */
  constructor(options: OutboxOptions) {
    this.agentId = options.agentId;
    this.send = options.send;
    this.maxRows = options.maxRows ?? DEFAULTS.maxRows;
    this.maxAttempts = options.maxAttempts ?? DEFAULTS.maxAttempts;
    this.baseBackoffMs = options.baseBackoffMs ?? DEFAULTS.baseBackoffMs;
    this.maxBackoffMs = options.maxBackoffMs ?? DEFAULTS.maxBackoffMs;
    this.pollMs = options.pollMs ?? DEFAULTS.pollMs;

    const dir = options.dir ?? process.env.AGENT_RELAY_RUNNER_OUTBOX_DIR ?? join(tmpdir(), "agent-relay-outbox");
    this.path = options.dir === ":memory:" ? ":memory:" : join(dir, `outbox-${safeName(this.agentId)}.sqlite`);
    if (this.path !== ":memory:") mkdirSync(dirname(this.path), { recursive: true });

    this.db = new Database(this.path, { create: true });
    this.db.exec("PRAGMA journal_mode = WAL");
    this.db.exec("PRAGMA busy_timeout = 2000");
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS outbox (
        seq INTEGER PRIMARY KEY AUTOINCREMENT,
        kind TEXT NOT NULL,
        mode TEXT NOT NULL DEFAULT 'append',
        dedupe_key TEXT,
        occurred_at INTEGER NOT NULL,
        idempotency_key TEXT NOT NULL,
        payload TEXT NOT NULL,
        attempts INTEGER NOT NULL DEFAULT 0,
        next_attempt_at INTEGER NOT NULL DEFAULT 0,
        poisoned INTEGER NOT NULL DEFAULT 0,
        created_at INTEGER NOT NULL
      )
    `);
    // A restart is a fresh start: clear any backoff timers left by the prior process so
    // pending events get an immediate retry (the down server may now be back). `attempts`
    // is kept so the poison threshold still counts cumulative failures.
    this.db.exec("UPDATE outbox SET next_attempt_at = 0 WHERE next_attempt_at > 0");
  }

  /**
   * Persists an event and schedules a drain.
   *
   * @param input Event to queue. `mode` defaults to "append", `occurredAt` to now, and
   *   `idempotencyKey` to a key derived from agent id, kind, timestamp and payload hash.
   * @returns The `seq` assigned to the inserted row.
   * @throws Error if the outbox has been stopped, or if coalesce mode is requested
   *   without a `dedupeKey`.
   */
  enqueue(input: OutboxEventInput): number {
    if (this.stopped) throw new Error("outbox is stopped");
    const mode: OutboxMode = input.mode ?? "append";
    const occurredAt = input.occurredAt ?? Date.now();
    const payloadJson = JSON.stringify(input.payload ?? null);
    const idempotencyKey = input.idempotencyKey ?? `${this.agentId}:${input.kind}:${occurredAt}:${shortHash(payloadJson)}`;

    if (mode === "coalesce") {
      if (!input.dedupeKey) throw new Error("coalesce mode requires a dedupeKey");
      // Last-wins: drop earlier un-poisoned rows carrying the same dedupe key.
      this.db.query("DELETE FROM outbox WHERE dedupe_key = ? AND poisoned = 0").run(input.dedupeKey);
    }

    const info = this.db
      .query(`INSERT INTO outbox (kind, mode, dedupe_key, occurred_at, idempotency_key, payload, created_at)
              VALUES (?, ?, ?, ?, ?, ?, ?)`)
      .run(input.kind, mode, input.dedupeKey ?? null, occurredAt, idempotencyKey, payloadJson, Date.now());
    const seq = Number(info.lastInsertRowid);

    this.enforceBound();
    // Defer the drain to a microtask so a synchronous burst of enqueues (e.g. several
    // coalesce updates) all land — and coalesce — before the pump pulls the head.
    queueMicrotask(() => { void this.drain(); });
    return seq;
  }

  // Overflow policy: when the table exceeds maxRows, delete the excess rows — poisoned
  // rows first, then oldest live rows, both in seq order — and log how many were dropped.
  private enforceBound(): void {
    const { n } = this.db.query("SELECT count(*) AS n FROM outbox").get() as { n: number };
    if (n <= this.maxRows) return;
    const overflow = n - this.maxRows;
    const victims = this.db
      .query("SELECT seq FROM outbox ORDER BY poisoned DESC, seq ASC LIMIT ?")
      .all(overflow) as Array<{ seq: number }>;
    const ids = victims.map((v) => v.seq);
    if (ids.length === 0) return;
    const placeholders = ids.map(() => "?").join(",");
    this.db.query(`DELETE FROM outbox WHERE seq IN (${placeholders})`).run(...ids);
    logger.warn("outbox", `bound exceeded (${n}/${this.maxRows}) — dropped ${ids.length} oldest event(s)`);
  }

  /** Starts the background pump: one immediate drain plus a poll timer as a backstop. No-op if already started or stopped. */
  start(): void {
    if (this.pollTimer || this.stopped) return;
    void this.drain();
    this.pollTimer = setInterval(() => { void this.drain(); }, this.pollMs);
    this.pollTimer.unref?.();
  }

  /**
   * Processes the queue strictly oldest-first. Concurrent calls coalesce: a call made
   * while a drain is running returns immediately and causes one more pass at the end,
   * so an enqueue during a send isn't missed.
   *
   * Every internal caller is fire-and-forget (`void this.drain()`), so an unexpected
   * storage error is logged here rather than surfacing as an unhandled rejection; the
   * poll timer retries on its next tick.
   */
  async drain(): Promise<void> {
    if (this.stopped) return;
    if (this.draining) { this.rerun = true; return; }
    this.draining = true;
    try {
      do {
        this.rerun = false;
        await this.drainOnce();
      } while (this.rerun && !this.stopped);
    } catch (error) {
      // After stop()/close() a failure here is just the shutdown race; stay quiet.
      if (!this.stopped) {
        logger.error("outbox", `drain failed: ${error instanceof Error ? error.message : String(error)}`);
      }
    } finally {
      this.draining = false;
    }
  }

  // One pass over the queue: send the un-poisoned head, delete it on success, and on
  // failure either schedule a backoff retry (and stop, to preserve order) or poison it
  // and move on to the next head.
  private async drainOnce(): Promise<void> {
    for (;;) {
      if (this.stopped) return;
      const row = this.db
        .query("SELECT * FROM outbox WHERE poisoned = 0 ORDER BY seq ASC LIMIT 1")
        .get() as Row | null;
      if (!row) return;

      const now = Date.now();
      if (row.next_attempt_at > now) {
        // Head isn't due yet. Don't reorder past it (FIFO) — schedule a wake-up and stop.
        this.scheduleDue(row.next_attempt_at - now);
        return;
      }

      const record: OutboxRecord = {
        seq: row.seq,
        kind: row.kind,
        mode: row.mode as OutboxMode,
        occurredAt: row.occurred_at,
        idempotencyKey: row.idempotency_key,
        payload: safeParse(row.payload),
        attempts: row.attempts,
      };

      try {
        await this.send(record);
        this.db.query("DELETE FROM outbox WHERE seq = ?").run(row.seq);
      } catch (error) {
        // stop()/close() may have run while the send was awaited. After close() the
        // database handle is gone, so touching it again would throw out of this handler.
        // Leave the row as-is: it stays on disk and is picked up by the next process,
        // with the same idempotency key, without charging an attempt to the shutdown.
        if (this.stopped) return;
        const attempts = row.attempts + 1;
        const reason = error instanceof Error ? error.message : String(error);
        if (attempts >= this.maxAttempts) {
          this.db.query("UPDATE outbox SET attempts = ?, poisoned = 1 WHERE seq = ?").run(attempts, row.seq);
          logger.fatal("outbox", `event seq=${row.seq} kind=${row.kind} poisoned after ${attempts} attempts: ${reason}`);
          // Move on — the next iteration picks the new head (poison no longer blocks).
          continue;
        }
        const delay = this.backoff(attempts);
        this.db.query("UPDATE outbox SET attempts = ?, next_attempt_at = ? WHERE seq = ?").run(attempts, now + delay, row.seq);
        logger.debug("outbox", `event seq=${row.seq} kind=${row.kind} retry ${attempts}/${this.maxAttempts} in ${delay}ms: ${reason}`);
        this.scheduleDue(delay);
        return; // head is now scheduled; stop until it's due (preserve order)
      }
    }
  }

  // Exponential backoff capped at maxBackoffMs, jittered into the upper half of the
  // window: the result lies between exp/2 and exp.
  private backoff(attempts: number): number {
    const exp = Math.min(this.maxBackoffMs, this.baseBackoffMs * 2 ** (attempts - 1));
    return Math.round(exp / 2 + Math.random() * (exp / 2));
  }

  // Arms a one-shot wake-up for the head row. At most one such timer exists at a time;
  // a request made while one is pending is ignored (the poll timer is the backstop).
  private scheduleDue(delayMs: number): void {
    if (this.stopped || this.dueTimer) return;
    this.dueTimer = setTimeout(() => {
      this.dueTimer = undefined;
      void this.drain();
    }, Math.max(0, delayMs));
    this.dueTimer.unref?.();
  }

  /** Number of rows still eligible for delivery (not poisoned). For observability / tests. */
  pendingCount(): number {
    return (this.db.query("SELECT count(*) AS n FROM outbox WHERE poisoned = 0").get() as { n: number }).n;
  }

  /** Number of rows that exhausted their attempts and were poisoned. */
  poisonedCount(): number {
    return (this.db.query("SELECT count(*) AS n FROM outbox WHERE poisoned = 1").get() as { n: number }).n;
  }

  /** Halts the pump and clears both timers. Terminal: enqueue() throws and start()/drain() are no-ops afterwards. */
  stop(): void {
    this.stopped = true;
    if (this.pollTimer) clearInterval(this.pollTimer);
    this.pollTimer = undefined;
    if (this.dueTimer) clearTimeout(this.dueTimer);
    this.dueTimer = undefined;
  }

  /** Stops the pump and closes the database handle. Safe to call more than once. */
  close(): void {
    this.stop();
    try { this.db.close(); } catch { /* already closed */ }
  }
}
285
+
286
/**
 * Turns an arbitrary identifier into a filename-safe token: every run of characters
 * outside [a-zA-Z0-9_.-] becomes a single "_", the result is capped at 180 characters,
 * and an empty result falls back to "agent".
 */
function safeName(value: string): string {
  const sanitized = value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
  return sanitized.length > 0 ? sanitized : "agent";
}
289
+
290
/** Parses JSON text, yielding `null` instead of throwing when the text is not valid JSON. */
function safeParse(json: string): unknown {
  let parsed: unknown = null;
  try {
    parsed = JSON.parse(json);
  } catch {
    // Malformed input: keep the null default.
  }
  return parsed;
}
293
+
294
/**
 * 32-bit FNV-1a over the string's UTF-16 code units, rendered as an unsigned base-36
 * string. Used to tell apart payloads that share a kind and timestamp in the derived
 * idempotency key. Not suitable for anything security-sensitive.
 */
function shortHash(value: string): string {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let hash = FNV_OFFSET_BASIS;
  let index = 0;
  while (index < value.length) {
    // XOR in the next code unit, then multiply by the prime with 32-bit wraparound.
    hash = Math.imul(hash ^ value.charCodeAt(index), FNV_PRIME);
    index += 1;
  }
  // `>>> 0` reinterprets the signed 32-bit result as unsigned before formatting.
  return (hash >>> 0).toString(36);
}
@@ -45,15 +45,29 @@ export function workspaceDepsNote(input: { mode?: string | null; depsMode?: stri
45
45
  }
46
46
  }
47
47
 
48
- /** Resolve the workspace deps caveat from the runner/monitor environment.
48
/**
 * Builds the agent-facing warning about untracked paths that an isolated worktree
 * symlinks from the main checkout (e.g. AGENTS.md, .claude-rig). Because such paths
 * resolve to the real files in main, edits and deletions write through to main; the
 * note tells the agent to treat them as read-only unless changing main is intended.
 *
 * @param linked Paths that were symlinked into the worktree.
 * @returns The warning text, or "" when nothing was linked.
 */
export function workspaceSymlinksNote(linked: string[]): string {
  const hasLinks = linked.length > 0;
  if (!hasLinks) return "";
  const paths = linked.join(", ");
  return `[agent-relay] Isolated workspace: these untracked paths are SYMLINKED from the main checkout: ${paths}. They resolve to the real files in main, so editing or deleting them writes THROUGH to main — treat them as read-only unless you intend to change main.`;
}
58
+
59
+ /** Resolve the workspace caveats from the runner/monitor environment.
49
60
  * AGENT_RELAY_WORKSPACE_JSON carries the resolved workspace metadata (mode +
50
- * deps) and is the authoritative source. Best-effort: never throws. */
61
+ * deps + symlinks) and is the authoritative source. Best-effort: never throws. */
51
62
  export function workspaceDepsNoteFromEnv(env: Record<string, string | undefined> = process.env): string {
52
63
  const json = env.AGENT_RELAY_WORKSPACE_JSON;
53
64
  if (!json) return "";
54
65
  try {
55
- const parsed = JSON.parse(json) as { mode?: string; deps?: { mode?: string } };
56
- return workspaceDepsNote({ mode: parsed.mode ?? null, depsMode: parsed.deps?.mode ?? null });
66
+ const parsed = JSON.parse(json) as { mode?: string; deps?: { mode?: string }; symlinks?: { linked?: string[] } };
67
+ return [
68
+ workspaceDepsNote({ mode: parsed.mode ?? null, depsMode: parsed.deps?.mode ?? null }),
69
+ parsed.mode === "isolated" ? workspaceSymlinksNote(parsed.symlinks?.linked ?? []) : "",
70
+ ].filter(Boolean).join("\n\n");
57
71
  } catch {
58
72
  return "";
59
73
  }
@@ -0,0 +1,109 @@
1
+ import type { ReplyObligation } from "agent-relay-sdk";
2
+ import { logger } from "./logger";
3
+
4
+ // Phase 2 (#196) — the crux. The Claude Stop hook used to ask the server, synchronously
5
+ // and in the hot path, "does this agent owe a reply?" before clearing the turn. A slow
6
+ // server answer (the unindexed reply_to scan, #199) blew the hook's timeout and wedged the
7
+ // agent in `busy` forever. The fix: the hook asks the Runner, the Runner answers instantly
8
+ // from this local snapshot, and the snapshot is refreshed from the server only in the
9
+ // background — never on the path that ends a turn.
10
+ //
11
+ // Design rules:
12
+ // - `get()` is synchronous, never throws, never touches the network.
13
+ // - `refresh()` is the only thing that talks to the server; it coalesces concurrent calls
14
+ // and, on failure, keeps the last-known snapshot (stale-but-serving beats blocking).
15
+ // - A background interval keeps the snapshot warm; `markDirty()` requests an extra,
16
+ // debounced refresh when state likely just changed (a message arrived, a turn ended).
17
+
18
+ export type ReplyObligationFetch = () => Promise<ReplyObligation[]>;
19
+
20
+ export interface ReplyObligationCacheOptions {
21
+ fetch: ReplyObligationFetch;
22
+ // Background freshness backstop. Default 10s — well under any turn cadence, cheap.
23
+ intervalMs?: number;
24
+ // Debounce window for markDirty()-triggered refreshes so a burst of events
25
+ // (e.g. a fan-out of messages) collapses into one server round-trip.
26
+ dirtyDebounceMs?: number;
27
+ }
28
+
29
+ const DEFAULT_INTERVAL_MS = 10_000;
30
+ const DEFAULT_DIRTY_DEBOUNCE_MS = 400;
31
+
32
/**
 * Local snapshot of the agent's reply obligations.
 *
 * `get()` answers synchronously from memory; `refresh()` is the only method that
 * invokes the supplied `fetch`. A background interval keeps the snapshot warm and
 * `markDirty()` requests an extra, debounced refresh when state likely changed.
 * A failed fetch leaves the previous snapshot in place.
 */
export class ReplyObligationCache {
  private readonly fetch: ReplyObligationFetch;
  private readonly intervalMs: number;
  private readonly dirtyDebounceMs: number;

  private snapshot: ReplyObligation[] = [];
  // Epoch ms of the last successful refresh; 0 until the first one lands.
  private lastRefreshedAt = 0;
  private inFlight: Promise<void> | null = null;
  // Set when a dirty signal arrives while a fetch is already in flight: that fetch may
  // have been answered before the change, so one more refresh runs once it settles.
  private refreshQueued = false;
  private intervalTimer?: ReturnType<typeof setInterval>;
  private dirtyTimer?: ReturnType<typeof setTimeout>;
  private stopped = false;

  /**
   * @param options `fetch` loads the current obligations; `intervalMs` (default 10s)
   *   and `dirtyDebounceMs` (default 400ms) tune the background and dirty refreshes.
   */
  constructor(options: ReplyObligationCacheOptions) {
    this.fetch = options.fetch;
    this.intervalMs = options.intervalMs ?? DEFAULT_INTERVAL_MS;
    this.dirtyDebounceMs = options.dirtyDebounceMs ?? DEFAULT_DIRTY_DEBOUNCE_MS;
  }

  /** Synchronous read of the snapshot. Returns a shallow copy so callers can't replace or reorder the cached array. */
  get(): ReplyObligation[] {
    return this.snapshot.slice();
  }

  /** Epoch milliseconds of the last successful refresh, or 0 if none has succeeded yet. */
  getLastRefreshedAt(): number {
    return this.lastRefreshedAt;
  }

  /** Primes the snapshot immediately and starts the periodic refresh. No-op if already started or stopped. */
  start(): void {
    if (this.intervalTimer || this.stopped) return;
    void this.refresh();
    this.intervalTimer = setInterval(() => { void this.refresh(); }, this.intervalMs);
    // Don't keep the process alive solely for cache refreshes.
    this.intervalTimer.unref?.();
  }

  /** Stops all timers and discards any queued follow-up refresh. Terminal: later refreshes are no-ops. */
  stop(): void {
    this.stopped = true;
    this.refreshQueued = false;
    if (this.intervalTimer) clearInterval(this.intervalTimer);
    this.intervalTimer = undefined;
    if (this.dirtyTimer) clearTimeout(this.dirtyTimer);
    this.dirtyTimer = undefined;
  }

  /**
   * Requests a refresh because state likely changed (message arrived / turn ended).
   * Debounced so a burst collapses into a single fetch. If a fetch is already running
   * when the debounce fires, a follow-up refresh is queued instead of coalescing onto
   * it — otherwise the change could stay invisible until the next interval tick.
   */
  markDirty(): void {
    if (this.stopped || this.dirtyTimer) return;
    this.dirtyTimer = setTimeout(() => {
      this.dirtyTimer = undefined;
      if (this.inFlight) {
        this.refreshQueued = true;
        return;
      }
      void this.refresh();
    }, this.dirtyDebounceMs);
    this.dirtyTimer.unref?.();
  }

  /**
   * Fetches and replaces the snapshot. Concurrent callers share one in-flight request.
   * The returned promise never rejects: a failed fetch keeps the prior snapshot.
   */
  refresh(): Promise<void> {
    if (this.stopped) return Promise.resolve();
    if (this.inFlight) return this.inFlight;
    this.inFlight = this.doRefresh().finally(() => {
      this.inFlight = null;
      // Honour a dirty signal that arrived mid-flight with exactly one more fetch.
      if (this.refreshQueued) {
        this.refreshQueued = false;
        void this.refresh();
      }
    });
    return this.inFlight;
  }

  // Runs one fetch. A result that arrives after stop() is discarded; a non-array result
  // is treated as "no obligations".
  private async doRefresh(): Promise<void> {
    try {
      const obligations = await this.fetch();
      if (this.stopped) return;
      this.snapshot = Array.isArray(obligations) ? obligations : [];
      this.lastRefreshedAt = Date.now();
    } catch (error) {
      // Server-down is a non-event: keep serving the last snapshot. Debug, not error —
      // this is expected during outages and must not spam the log.
      logger.debug("obligation-cache", `refresh failed, serving cached snapshot (${this.snapshot.length}): ${error instanceof Error ? error.message : String(error)}`);
    }
  }
}
package/src/runner.ts CHANGED
@@ -2,13 +2,15 @@ import { hostname } from "node:os";
2
2
  import { closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { dirname, join } from "node:path";
5
- import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
5
+ import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, SendMessageInput, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
6
6
  import { RelayBusClient, RelayHttpClient } from "agent-relay-sdk";
7
7
  import { contextStateFromProbeMetrics, readContextProbeState } from "agent-relay-sdk/context-probe";
8
8
  import type { ManagedProcess, ProviderAdapter, ProviderConfig, ProviderPermissionDecision, ProviderPermissionDecisionInput, ProviderSessionEvent, ProviderStatusUpdate, RunnerSpawnConfig, SemanticStatus, TerminalAttachSpec } from "./adapter";
9
9
  import { messagesWithCachedAttachments } from "./attachment-cache";
10
10
  import { ClaimTracker } from "./claim-tracker";
11
11
  import { startControlServer, type ControlServer } from "./control-server";
12
+ import { ReplyObligationCache } from "./reply-obligation-cache";
13
+ import { Outbox, type OutboxRecord } from "./outbox";
12
14
  import { extractLastAssistantTurn, extractFinalAssistantMessage, extractHookAssistantMessage, extractLatestTurnSteps, transcriptLooksComplete, analyzeSession } from "./adapters/claude-transcript";
13
15
  import { agentProfileProjectionReport } from "./profile-projection";
14
16
  import { profileUsesHostProviderGlobals } from "./profile-home";
@@ -116,6 +118,13 @@ export class AgentRunner {
116
118
  private readonly claims = new ClaimTracker();
117
119
  private readonly http: RelayHttpClient;
118
120
  private readonly bus: RelayBusClient;
121
+ // Phase 2 (#196): the Stop hook reads reply obligations from this local snapshot, never
122
+ // from the server — so a slow server can no longer wedge a turn (the crux fix).
123
+ private readonly obligationCache: ReplyObligationCache;
124
+ // Phase 2 (#196): Runner→server append-log events (session turns, reasoning, prompts,
125
+ // insights, hook-fatal) go through this durable, disk-backed, timestamped queue instead of
126
+ // direct fire-and-forget HTTP — so nothing is lost across a server/Runner restart.
127
+ private readonly outbox: Outbox;
119
128
  private currentToken?: string;
120
129
  private currentTokenJti?: string;
121
130
  private currentTokenProfileId?: string;
@@ -192,6 +201,12 @@ export class AgentRunner {
192
201
  this.currentTokenExpiresAt = options.tokenExpiresAt;
193
202
  const runtime = runtimeMetadata(options.provider);
194
203
  this.http = new RelayHttpClient({ baseUrl: options.relayUrl, token: this.currentToken });
204
+ this.obligationCache = new ReplyObligationCache({ fetch: () => this.http.listReplyObligations(this.agentId) });
205
+ // Co-locate the durable outbox with the runner's runtime state (survives reboot) when the
206
+ // orchestrator told us where that is; otherwise the Outbox falls back to a temp dir.
207
+ const outboxDir = process.env.AGENT_RELAY_RUNNER_OUTBOX_DIR
208
+ ?? (process.env.AGENT_RELAY_RUNNER_INFO_FILE ? join(dirname(process.env.AGENT_RELAY_RUNNER_INFO_FILE), "outbox") : undefined);
209
+ this.outbox = new Outbox({ agentId: this.agentId, dir: outboxDir, send: (record) => this.deliverOutboxEvent(record) });
195
210
  this.bus = new RelayBusClient({
196
211
  url: relayBusUrl(options.relayUrl),
197
212
  role: "provider",
@@ -260,10 +275,13 @@ export class AgentRunner {
260
275
  this.control = startControlServer({
261
276
  onStatus: (status) => this.setProviderStatus(status),
262
277
  onTerminalAttachSpec: () => this.terminalAttachSpec(),
263
- onReplyObligations: () => this.http.listReplyObligations(this.agentId),
278
+ // Hot-path-safe: answered instantly from the local snapshot, never a server
279
+ // round-trip. The snapshot is kept warm by the background refresh below (#196).
280
+ onReplyObligations: () => Promise.resolve(this.obligationCache.get()),
264
281
  onSessionTurn: (input) => this.publishSessionTurn(input),
265
282
  onUserPrompt: (input) => this.handleUserPrompt(input),
266
283
  onSessionEnd: (input) => this.handleSessionEnd(input),
284
+ onHookFatal: (report) => this.reportHookFatal(report),
267
285
  });
268
286
  this.writeRunnerInfoFile();
269
287
  this.options.adapter.onStatusChange((status) => {
@@ -277,12 +295,19 @@ export class AgentRunner {
277
295
  if (runnerShouldResolveProviderExit(semanticStatus, this.exitCommandInProgress)) this.options.onProviderExit?.(semanticStatus === "offline" ? 0 : 1);
278
296
  });
279
297
  this.options.adapter.onSessionEvent?.((event) => { void this.publishProviderSessionEvent(event); });
280
- this.bus.on("message.new", (message) => this.enqueueMessage(message as Message));
298
+ this.bus.on("message.new", (message) => {
299
+ // A delivered message may create a new reply obligation — warm the snapshot so the
300
+ // next turn-end sees it without a hot-path server read.
301
+ this.obligationCache.markDirty();
302
+ this.enqueueMessage(message as Message);
303
+ });
281
304
  this.bus.on("command", (type, params, commandId, command) => {
282
305
  void this.handleCommand(type, params, commandId, command);
283
306
  });
284
307
  this.bus.on("error", (code, message) => this.handleBusError(String(code), String(message)));
285
308
  await this.bus.connect();
309
+ this.obligationCache.start();
310
+ this.outbox.start();
286
311
  this.ensureScratch();
287
312
  void this.sweepStaleScratch();
288
313
  this.process = await this.spawnProvider();
@@ -322,6 +347,8 @@ export class AgentRunner {
322
347
  this.tokenRenewTimer = undefined;
323
348
  this.disarmBusyReconciler();
324
349
  this.stopReasoningTail();
350
+ this.obligationCache.stop();
351
+ this.outbox.close();
325
352
  this.control?.stop();
326
353
  await this.bus.close();
327
354
  }
@@ -927,13 +954,10 @@ export class AgentRunner {
927
954
  replyToMessageId = pendingPrompt;
928
955
  this.pendingPromptMessageId = undefined;
929
956
  } else {
930
- try {
931
- const obligations = await this.http.listReplyObligations(this.agentId);
932
- const obligation = [...obligations].reverse().find((o) => o.from === "user");
933
- replyToMessageId = obligation?.messageId;
934
- } catch {
935
- // fall through and capture without correlation
936
- }
957
+ // Correlation-only (threading + obligation clearing) — the local snapshot is fresh
958
+ // enough and never blocks the response-capture path (#196).
959
+ const obligation = [...this.obligationCache.get()].reverse().find((o) => o.from === "user");
960
+ replyToMessageId = obligation?.messageId;
937
961
  }
938
962
 
939
963
  // The Stop hook can fire before the final assistant entry is flushed to disk.
@@ -975,31 +999,86 @@ export class AgentRunner {
975
999
  ...(replyToMessageId ? { replyTo: replyToMessageId } : {}),
976
1000
  session: { type: "response", origin: "provider", ...(turnId ? { turnId } : {}) },
977
1001
  });
1002
+ // The agent's reply may have cleared an obligation — refresh the snapshot so the next
1003
+ // turn-end doesn't re-prompt for a message already answered (#196).
1004
+ if (replyToMessageId) this.obligationCache.markDirty();
978
1005
  }
979
1006
 
980
1007
  // Post one session-mirror event (prompt echo, assistant response, reasoning or
981
1008
  // tool step) as a `kind: "session"` relay message tagged with payload.session so
982
1009
  // the dashboard can render the live provider session faithfully. Display-only:
983
1010
  // session messages are never delivered back into a provider.
984
- private async publishSessionEvent(input: {
1011
+ private publishSessionEvent(input: {
985
1012
  from: string;
986
1013
  to: string;
987
1014
  body: string;
988
1015
  session: MessageSessionMeta;
989
1016
  replyTo?: number;
990
- }): Promise<void> {
991
- try {
992
- await this.http.sendMessage({
1017
+ }): void {
1018
+ // Durable, ordered, timestamped (#196): the actual POST happens in deliverOutboxEvent,
1019
+ // retried until it lands. occurredAt is stamped now so a queued event reports when it
1020
+ // truly happened, not when the server finally accepted it.
1021
+ this.outbox.enqueue({
1022
+ kind: "session-message",
1023
+ payload: {
993
1024
  from: input.from,
994
1025
  to: input.to,
995
1026
  ...(input.replyTo ? { replyTo: input.replyTo } : {}),
996
1027
  kind: "session",
997
1028
  body: input.body,
998
1029
  payload: { session: { provider: this.options.provider, ...input.session } },
1030
+ } satisfies SendMessageInput,
1031
+ });
1032
+ }
1033
+
1034
// Outbox transport: turns one queued record into the matching HTTP call. Returning
// acknowledges the record (the outbox deletes it); throwing asks the outbox to retry.
// The record's occurredAt (and, for session messages, its idempotencyKey) is sent along
// so a retried delivery carries the original event time and the same key.
private async deliverOutboxEvent(record: OutboxRecord): Promise<void> {
  try {
    switch (record.kind) {
      case "session-message": {
        const message = record.payload as SendMessageInput;
        await this.http.sendMessage({
          ...message,
          occurredAt: record.occurredAt,
          idempotencyKey: record.idempotencyKey,
        });
        return;
      }
      case "insight": {
        const observation = record.payload as Parameters<RelayHttpClient["recordInsightObservation"]>[0];
        await this.http.recordInsightObservation({
          ...observation,
          occurredAt: record.occurredAt,
        });
        return;
      }
      default:
        // Nothing knows how to deliver this kind; acknowledge it so it can't block the queue.
        logger.warn("outbox", `dropping event with unknown kind: ${record.kind}`);
    }
  } catch (error) {
    // A 409 means the server deliberately refused the event (e.g. Insights/feature
    // toggled off) — a permanent answer, so acknowledge instead of retrying.
    if (isHttpStatusError(error, 409)) return;
    // On an auth failure, kick off token recovery, then still rethrow so the outbox
    // retries the record with backoff.
    if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("outbox");
    throw error;
  }
}
1063
+
1064
+ // A hook reported an unhandled failure (#198 seam). Already logged FATAL by the control
1065
+ // server; here we additionally surface it durably to the server as a generic insight so
1066
+ // it shows up in observability rather than only in the per-agent log (#196).
1067
+ private reportHookFatal(report: { hook: string; error: string }): void {
1068
+ try {
1069
+ this.outbox.enqueue({
1070
+ kind: "insight",
1071
+ payload: {
1072
+ sessionId: this.providerSessionId,
1073
+ project: this.options.cwd,
1074
+ agentId: this.agentId,
1075
+ signal: "hook_fatal",
1076
+ value: { hook: report.hook, error: report.error },
1077
+ source: "server",
1078
+ },
999
1079
  });
1000
1080
  } catch (error) {
1001
- this.logRunnerDiagnostic(`session ${input.session.type} capture failed: ${error instanceof Error ? error.message : String(error)}`);
1002
- if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("session-capture");
1081
+ logger.error("outbox", `failed to queue hook-fatal report: ${error instanceof Error ? error.message : String(error)}`);
1003
1082
  }
1004
1083
  }
1005
1084
 
@@ -1043,8 +1122,11 @@ export class AgentRunner {
1043
1122
  }
1044
1123
  const analysis = analyzeSession(jsonl);
1045
1124
  if (!analysis) return; // no tool calls = nothing substantive to measure
1046
- try {
1047
- await this.http.recordInsightObservation({
1125
+ // Durable + non-blocking (#196): queue it. SessionEnd can race provider shutdown, so a
1126
+ // direct POST risked being dropped if the server hiccuped; the outbox survives that.
1127
+ this.outbox.enqueue({
1128
+ kind: "insight",
1129
+ payload: {
1048
1130
  sessionId: this.providerSessionId,
1049
1131
  project: this.options.cwd,
1050
1132
  agentId: this.agentId,
@@ -1052,13 +1134,9 @@ export class AgentRunner {
1052
1134
  value: { ...analysis.metric, ...(input.reason ? { endReason: input.reason } : {}) },
1053
1135
  outcome: { ...analysis.outcome },
1054
1136
  source: "server",
1055
- });
1056
- this.sessionLog(`insights: context_ratio ${analysis.metric.ratio.toFixed(2)} (${analysis.metric.gatheringCalls}/${analysis.metric.totalToolCalls} gathering)`);
1057
- } catch (error) {
1058
- // 409 = Insights/feature toggled off; anything else is best-effort too.
1059
- this.sessionDebug(`insights context_ratio skipped: ${error instanceof Error ? error.message : String(error)}`);
1060
- if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("insights");
1061
- }
1137
+ },
1138
+ });
1139
+ this.sessionLog(`insights: context_ratio ${analysis.metric.ratio.toFixed(2)} (${analysis.metric.gatheringCalls}/${analysis.metric.totalToolCalls} gathering) queued`);
1062
1140
  }
1063
1141
 
1064
1142
  // Route a provider-emitted session event (Codex app-server) into the chat mirror.
@@ -1087,13 +1165,9 @@ export class AgentRunner {
1087
1165
  if (pendingPrompt) {
1088
1166
  replyToMessageId = pendingPrompt;
1089
1167
  this.pendingPromptMessageId = undefined;
1090
- } else {
1091
- try {
1092
- const obligations = await this.http.listReplyObligations(this.agentId);
1093
- if (obligations.some((o) => o.from === "user")) return;
1094
- } catch {
1095
- // capture anyway on lookup failure
1096
- }
1168
+ } else if (this.obligationCache.get().some((o) => o.from === "user")) {
1169
+ // The agent will answer the relay obligation itself — don't double-post (#196).
1170
+ return;
1097
1171
  }
1098
1172
  await this.publishSessionEvent({
1099
1173
  from: this.agentId,
@@ -1953,6 +2027,11 @@ function isHttpAuthError(error: unknown): boolean {
1953
2027
  return status === 401 || status === 403;
1954
2028
  }
1955
2029
 
2030
/** True when `error` is a non-null object whose `status` property strictly equals `code`. */
function isHttpStatusError(error: unknown, code: number): boolean {
  if (typeof error !== "object" || error === null) return false;
  return (error as { status?: unknown }).status === code;
}
2034
+
1956
2035
  function httpErrorKey(error: unknown): string {
1957
2036
  const status = typeof error === "object" && error !== null ? (error as { status?: unknown }).status : undefined;
1958
2037
  if (typeof status === "number") return `status:${status}`;