agent-relay-runner 0.15.1 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/plugins/claude/.claude-plugin/plugin.json +1 -1
- package/plugins/claude/hooks/permission-request.sh +2 -0
- package/plugins/claude/hooks/post-compact.sh +1 -0
- package/plugins/claude/hooks/pre-compact.sh +1 -0
- package/plugins/claude/hooks/relay-status.sh +24 -0
- package/plugins/claude/hooks/session-end.sh +1 -0
- package/plugins/claude/hooks/session-start.sh +1 -0
- package/plugins/claude/hooks/stop-failure.sh +1 -0
- package/plugins/claude/hooks/stop.sh +12 -2
- package/plugins/claude/hooks/subagent-start.sh +1 -0
- package/plugins/claude/hooks/subagent-stop.sh +1 -0
- package/plugins/claude/hooks/user-prompt-submit.sh +1 -0
- package/src/adapters/codex.ts +3 -2
- package/src/control-server.ts +43 -0
- package/src/logger.ts +97 -0
- package/src/outbox.ts +303 -0
- package/src/relay-instructions.ts +18 -4
- package/src/reply-obligation-cache.ts +109 -0
- package/src/runner.ts +146 -74
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-relay-runner",
|
|
3
|
-
"version": "0.15.1",
|
|
3
|
+
"version": "0.17.0",
|
|
4
4
|
"description": "Unified provider lifecycle runner for Agent Relay",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
"directory": "runner"
|
|
21
21
|
},
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"agent-relay-sdk": "0.2.
|
|
23
|
+
"agent-relay-sdk": "0.2.8"
|
|
24
24
|
},
|
|
25
25
|
"devDependencies": {
|
|
26
26
|
"@types/bun": "latest",
|
|
@@ -92,6 +92,30 @@ relay_post_session_end() {
|
|
|
92
92
|
-d "$body" >/dev/null 2>&1 || true
|
|
93
93
|
}
|
|
94
94
|
|
|
95
|
+
# --- Hook FATAL surfacing (#198) -------------------------------------------
|
|
96
|
+
# A hook that dies unexpectedly must never be silent. relay_install_hook_guard
|
|
97
|
+
# arms an ERR trap that reports the failure FATAL to the runner control port,
|
|
98
|
+
# which logs it to the dashboard-surfaced per-agent log. Best-effort and bounded
|
|
99
|
+
# (--max-time 2) so the report itself can never blow the hook's timeout budget.
|
|
100
|
+
# Report a hook failure to the runner control port (POST /hook-fatal).
#   $1  hook name (defaults to "unknown")
#   $2  free-form failure detail (defaults to empty)
# Does nothing when AGENT_RELAY_RUNNER_PORT is unset or empty. Always returns 0:
# the POST is capped at 2 seconds and any curl failure is swallowed.
relay_hook_fatal_report() {
  local hook="${1:-unknown}"
  local detail="${2:-}"
  local port="${AGENT_RELAY_RUNNER_PORT:-}"
  if [ -z "$port" ]; then
    return 0
  fi
  # Built in a single `local` assignment on purpose: the exit status of the
  # command substitutions is masked, so an escape failure cannot abort the report.
  local body="{\"hook\":\"$(relay_json_escape "$hook")\",\"error\":\"$(relay_json_escape "$detail")\"}"
  curl -fsS --max-time 2 -X POST "http://127.0.0.1:${port}/hook-fatal" \
    -H 'Content-Type: application/json' \
    -d "$body" >/dev/null 2>&1 || true
}
|
|
109
|
+
|
|
110
|
+
# Arm the failure trap for the calling hook script.
#   $1  hook name, stored in RELAY_HOOK_NAME (defaults to "unknown")
relay_install_hook_guard() {
  RELAY_HOOK_NAME=${1:-unknown}
  # The ERR trap runs when a command fails unhandled in the hook's main body
  # (the hooks run under `set -euo pipefail`), right before the shell exits.
  # It captures the failing status and command, reports them, and does not
  # stop the exit from proceeding.
  # Without `set -E` the trap is not inherited by functions, so only the
  # top-level flow is covered.
  trap 'relay_hook_err_rc=$?; relay_hook_fatal_report "${RELAY_HOOK_NAME:-unknown}" "exit ${relay_hook_err_rc}: ${BASH_COMMAND}"' ERR
}
|
|
118
|
+
|
|
95
119
|
relay_pending_reply_stop_decision() {
|
|
96
120
|
local port="${AGENT_RELAY_RUNNER_PORT:-}"
|
|
97
121
|
[ -z "$port" ] && return 0
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard session-end
|
|
4
5
|
|
|
5
6
|
payload="$(cat || true)"
|
|
6
7
|
reason="$(relay_json_string_field reason "$payload")"
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard session-start
|
|
4
5
|
|
|
5
6
|
payload="$(cat || true)"
|
|
6
7
|
source_kind="$(relay_json_string_field source "$payload")"
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard stop
|
|
5
|
+
|
|
6
|
+
# Clearing the turn's busy state is the critical path (#199). Register it on EXIT
|
|
7
|
+
# so it runs even if a side-call below fails or times out under `set -e`. The one
|
|
8
|
+
# exception is the reply-obligation block path, which deliberately keeps the agent
|
|
9
|
+
# busy to answer — it opts out via the flag before exiting.
|
|
10
|
+
_relay_clear_idle_on_exit=1
|
|
11
|
+
trap '[ "${_relay_clear_idle_on_exit:-0}" = "1" ] && relay_post_status_clearing_subagents idle' EXIT
|
|
4
12
|
|
|
5
13
|
payload="$(cat || true)"
|
|
6
14
|
stop_hook_active="$(relay_json_bool_field stop_hook_active "$payload")"
|
|
@@ -8,12 +16,14 @@ if [ "$stop_hook_active" != "true" ]; then
|
|
|
8
16
|
last_assistant_msg="$(echo "$payload" | jq -c '.last_assistant_message // empty' 2>/dev/null || true)"
|
|
9
17
|
relay_post_session_turn "$(relay_json_string_field transcript_path "$payload")" "$last_assistant_msg"
|
|
10
18
|
# `|| true`: under `set -e`, a non-zero from the obligation check must never abort
|
|
11
|
-
# the hook before the idle-clear
|
|
19
|
+
# the hook before the idle-clear — clearing the turn is the critical path (#199).
|
|
12
20
|
stop_decision="$(relay_pending_reply_stop_decision || true)"
|
|
13
21
|
if [ -n "$stop_decision" ]; then
|
|
22
|
+
_relay_clear_idle_on_exit=0
|
|
14
23
|
printf '%s\n' "$stop_decision"
|
|
15
24
|
exit 0
|
|
16
25
|
fi
|
|
17
26
|
fi
|
|
18
27
|
|
|
19
|
-
|
|
28
|
+
# Normal turn end → the EXIT trap posts idle (always, even on an unexpected abort above).
|
|
29
|
+
exit 0
|
|
@@ -4,6 +4,7 @@ set -euo pipefail
|
|
|
4
4
|
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
|
|
5
5
|
# shellcheck source=/dev/null
|
|
6
6
|
source "${PLUGIN_ROOT}/hooks/relay-status.sh"
|
|
7
|
+
relay_install_hook_guard subagent-start
|
|
7
8
|
|
|
8
9
|
payload="$(cat || true)"
|
|
9
10
|
agent_id="$(relay_json_string_field agent_id "$payload")"
|
|
@@ -4,6 +4,7 @@ set -euo pipefail
|
|
|
4
4
|
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
|
|
5
5
|
# shellcheck source=/dev/null
|
|
6
6
|
source "${PLUGIN_ROOT}/hooks/relay-status.sh"
|
|
7
|
+
relay_install_hook_guard subagent-stop
|
|
7
8
|
|
|
8
9
|
payload="$(cat || true)"
|
|
9
10
|
agent_id="$(relay_json_string_field agent_id "$payload")"
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard user-prompt-submit
|
|
4
5
|
payload="$(cat || true)"
|
|
5
6
|
relay_post_status busy
|
|
6
7
|
# Mirror a terminal/TUI-typed prompt into the dashboard chat and start reasoning
|
package/src/adapters/codex.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { basename, join, resolve } from "node:path";
|
|
|
4
4
|
import type { ContextState, Message } from "agent-relay-sdk";
|
|
5
5
|
import { profileAllowsRelayFeature, providerMessageText, RELAY_CONTEXT, type ManagedProcess, type ProviderAdapter, type ProviderConfig, type ProviderPermissionDecisionInput, type ProviderSessionEvent, type ProviderStatusUpdate, type RunnerSpawnConfig, type SpawnArgs, type TerminalAttachSpec } from "../adapter";
|
|
6
6
|
import { workspaceDepsNoteFromEnv } from "../relay-instructions";
|
|
7
|
+
import { logger } from "../logger";
|
|
7
8
|
|
|
8
9
|
/** Relay context prepended to a Codex agent's first turn: the standard relay
|
|
9
10
|
* blurb plus, when running in an isolated workspace, the deps caveat (#159). */
|
|
@@ -199,7 +200,7 @@ export class CodexAdapter implements ProviderAdapter {
|
|
|
199
200
|
input = codexRelayContextBlock() + "\n\n" + input;
|
|
200
201
|
process.meta = { ...(process.meta ?? {}), relayContextSent: true };
|
|
201
202
|
}
|
|
202
|
-
|
|
203
|
+
logger.info("codex", `starting Codex initial prompt in thread ${threadId}`);
|
|
203
204
|
const client = process.meta?.client as CodexAppClient;
|
|
204
205
|
await client.turnStart(threadId, input);
|
|
205
206
|
}
|
|
@@ -211,7 +212,7 @@ export class CodexAdapter implements ProviderAdapter {
|
|
|
211
212
|
text = codexRelayContextBlock() + "\n\n" + text;
|
|
212
213
|
process.meta = { ...(process.meta ?? {}), relayContextSent: true };
|
|
213
214
|
}
|
|
214
|
-
|
|
215
|
+
logger.info("codex", codexDeliveryNotice(messages, threadId));
|
|
215
216
|
const client = process.meta?.client as CodexAppClient;
|
|
216
217
|
await client.turnStart(threadId, text);
|
|
217
218
|
}
|
package/src/control-server.ts
CHANGED
|
@@ -1,6 +1,16 @@
|
|
|
1
1
|
import type { Server, ServerWebSocket } from "bun";
|
|
2
2
|
import type { Message, ReplyObligation } from "agent-relay-sdk";
|
|
3
3
|
import type { ProviderPermissionDecisionInput, ProviderStatusEvent, SemanticStatus, TerminalAttachSpec } from "./adapter";
|
|
4
|
+
import { logger, parseLogLevel, LOG_LEVELS } from "./logger";
|
|
5
|
+
|
|
6
|
+
// A hook that failed in a way it could not handle itself reports here so the
|
|
7
|
+
// failure is never silent (#198 item 5). Phase 1 logs it FATAL to the per-agent
|
|
8
|
+
// log; Phase 2 (#196) will additionally route it through the runner outbox to the
|
|
9
|
+
// server.
|
|
10
|
+
/** A failure reported by a hook to the control server's `/hook-fatal` route. */
export interface HookFatalReport {
  /** Name of the hook that failed. */
  hook: string;
  /** Failure detail as supplied by the hook. */
  error: string;
}
|
|
4
14
|
|
|
5
15
|
interface MonitorSocketData {
|
|
6
16
|
kind: "monitor";
|
|
@@ -33,6 +43,10 @@ interface ControlServerOptions {
|
|
|
33
43
|
// transcript. transcriptPath is optional — the runner falls back to the last
|
|
34
44
|
// path it saw during the session.
|
|
35
45
|
onSessionEnd?(input: { reason?: string; transcriptPath?: string }): Promise<void>;
|
|
46
|
+
// Phase 1 observability (#198): a hook reporting an unhandled failure. The
|
|
47
|
+
// control server already logs it FATAL; this is the seam for Phase 2 to also
|
|
48
|
+
// surface it to the server via the runner outbox.
|
|
49
|
+
onHookFatal?(report: HookFatalReport): void;
|
|
36
50
|
}
|
|
37
51
|
|
|
38
52
|
export function startControlServer(options: ControlServerOptions): ControlServer {
|
|
@@ -81,6 +95,15 @@ export function startControlServer(options: ControlServerOptions): ControlServer
|
|
|
81
95
|
if (url.pathname === "/session-end" && req.method === "POST") {
|
|
82
96
|
return handleSessionEnd(req, options);
|
|
83
97
|
}
|
|
98
|
+
if (url.pathname === "/log-level" && req.method === "GET") {
|
|
99
|
+
return Response.json({ level: logger.getLevel(), levels: LOG_LEVELS });
|
|
100
|
+
}
|
|
101
|
+
if (url.pathname === "/log-level" && req.method === "POST") {
|
|
102
|
+
return handleLogLevel(req);
|
|
103
|
+
}
|
|
104
|
+
if (url.pathname === "/hook-fatal" && req.method === "POST") {
|
|
105
|
+
return handleHookFatal(req, options);
|
|
106
|
+
}
|
|
84
107
|
if (url.pathname === "/monitor") {
|
|
85
108
|
const upgraded = srv.upgrade(req, { data: { kind: "monitor" } });
|
|
86
109
|
return upgraded ? undefined : new Response("WebSocket upgrade failed", { status: 400 });
|
|
@@ -361,6 +384,26 @@ async function handleSessionEnd(req: Request, options: ControlServerOptions): Pr
|
|
|
361
384
|
return Response.json({ ok: true });
|
|
362
385
|
}
|
|
363
386
|
|
|
387
|
+
/**
 * `POST /log-level`: change the process-global logger's level at runtime.
 *
 * Reads a JSON body of the form `{ "level": "<name>" }`. Responds 400 (listing
 * the accepted names) when the body is not JSON, has no string `level`, or the
 * name is not a known level. Otherwise applies the level, logs the change and
 * responds with `{ ok, level, previous }`.
 */
async function handleLogLevel(req: Request): Promise<Response> {
  const payload = await req.json().catch(() => null);
  const requested = isRecord(payload) && typeof payload.level === "string" ? payload.level : undefined;
  const level = parseLogLevel(requested);
  if (!level) {
    return Response.json({ error: `level must be one of: ${LOG_LEVELS.join(", ")}` }, { status: 400 });
  }

  const previous = logger.getLevel();
  logger.setLevel(level);
  logger.info("logger", `log level set to ${level} (was ${previous}) via control port`);
  return Response.json({ ok: true, level, previous });
}
|
|
396
|
+
|
|
397
|
+
/**
 * `POST /hook-fatal`: record a failure a hook could not handle itself.
 *
 * The body is parsed leniently: a missing/blank `hook` becomes "unknown" and a
 * missing `error` becomes "(no detail)". The report is logged at FATAL and then
 * handed to the optional `onHookFatal` callback. Always responds `{ ok: true }`.
 */
async function handleHookFatal(req: Request, options: ControlServerOptions): Promise<Response> {
  const payload = await req.json().catch(() => null);

  let hook = "unknown";
  let error = "(no detail)";
  if (isRecord(payload)) {
    if (typeof payload.hook === "string" && payload.hook.trim()) hook = payload.hook.trim();
    if (typeof payload.error === "string") error = payload.error;
  }

  // Log first so the failure is visible even if the callback misbehaves.
  logger.fatal(`hook:${hook}`, error);
  try {
    options.onHookFatal?.({ hook, error });
  } catch {
    // A throwing callback must not turn the hook's report into an error response.
  }
  return Response.json({ ok: true });
}
|
|
406
|
+
|
|
364
407
|
async function handleStatus(req: Request, options: ControlServerOptions): Promise<Response> {
|
|
365
408
|
const body = await req.json().catch(() => null) as Partial<ProviderStatusEvent> | null;
|
|
366
409
|
const status = body?.status;
|
package/src/logger.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { appendFileSync, mkdirSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
|
|
4
|
+
// Phase 1 observability (#198): one leveled, runtime-togglable logger for the
|
|
5
|
+
// Runner and the provider adapters below it. Replaces the ad-hoc scatter of
|
|
6
|
+
// `console.error`, `logRunnerDiagnostic` (-> runner-<agent>.log) and
|
|
7
|
+
// `sessionLog`/`sessionDebug` (-> session-mirror-<agent>.log) with a single
|
|
8
|
+
// switch and a single greppable, ANSI-free sink.
|
|
9
|
+
//
|
|
10
|
+
// Sink: the per-agent `session-mirror-<agent>.log` — the file the orchestrator
|
|
11
|
+
// already surfaces to the dashboard log-viewer (captureSessionMirror). One place
|
|
12
|
+
// to look when anything in the Runner misbehaves.
|
|
13
|
+
//
|
|
14
|
+
// Level is read once from AGENT_RELAY_LOG_LEVEL (default "info") and can be
|
|
15
|
+
// flipped at runtime via the control port (no restart) — so a phase refactor can
|
|
16
|
+
// be watched at debug without bouncing the agent.
|
|
17
|
+
|
|
18
|
+
export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
|
|
19
|
+
|
|
20
|
+
const ORDER: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40, fatal: 50 };
|
|
21
|
+
export const LOG_LEVELS = Object.keys(ORDER) as LogLevel[];
|
|
22
|
+
|
|
23
|
+
/**
 * Parse a user-supplied level name.
 *
 * Matching ignores surrounding whitespace and letter case.
 *
 * @param value Raw level name (e.g. from an env var or request body).
 * @returns The matching level, or `undefined` for empty, missing or unknown input.
 */
export function parseLogLevel(value: string | undefined | null): LogLevel | undefined {
  const normalized = (value ?? "").trim().toLowerCase();
  // No level name is the empty string, so blank input falls through to undefined.
  return LOG_LEVELS.find((candidate) => candidate === normalized);
}
|
|
28
|
+
|
|
29
|
+
// Matches the runner's safeLogName and the orchestrator's safeMirrorLogName so all
|
|
30
|
+
// three resolve the identical filename for a given agent id.
|
|
31
|
+
/**
 * Turn an agent id into a filename-safe token: every run of characters outside
 * `[a-zA-Z0-9_.-]` becomes a single underscore, and the result is capped at
 * 180 characters.
 */
function safeLogName(value: string): string {
  const sanitized = value.replace(/[^a-zA-Z0-9_.-]+/g, "_");
  return sanitized.slice(0, 180);
}
|
|
34
|
+
|
|
35
|
+
/** Optional settings accepted by the `Logger` constructor and `configure()`. */
export interface LoggerConfig {
  /** Agent id used to derive the per-agent log file name. */
  agentId?: string;
  /** Minimum level that gets written. */
  level?: LogLevel;
  /** Whether the runner is headless. NOTE(review): stored by the logger; confirm where it is consumed. */
  headless?: boolean;
  /** Directory that holds the log file. */
  logDir?: string;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Leveled logger that appends one line per record to
 * `<logDir>/session-mirror-<agent>.log`.
 *
 * Writing is best-effort and never throws. If the file cannot be written,
 * records at `error` and above are echoed to stderr instead; lower levels are
 * dropped.
 */
export class Logger {
  private level: LogLevel;
  private agentId: string;
  private headless: boolean;
  private logDir: string;

  /**
   * @param config Optional overrides. Unset fields fall back to: level from
   *   `AGENT_RELAY_LOG_LEVEL` (else "info"), agent id "runner", headless false,
   *   and `$HOME/.agent-relay/logs` (or `./.agent-relay/logs` without HOME).
   */
  constructor(config: LoggerConfig = {}) {
    this.level = config.level ?? parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ?? "info";
    this.agentId = config.agentId ?? "runner";
    this.headless = config.headless ?? false;
    this.logDir = config.logDir ?? join(process.env.HOME || ".", ".agent-relay", "logs");
  }

  /**
   * Rebind the logger once the runner knows its identity. Only the fields
   * present in `config` are changed, so a level already set via env or at
   * runtime survives unless an explicit level is passed.
   */
  configure(config: LoggerConfig): void {
    if (config.agentId !== undefined) this.agentId = config.agentId;
    if (config.headless !== undefined) this.headless = config.headless;
    if (config.logDir !== undefined) this.logDir = config.logDir;
    if (config.level !== undefined) this.level = config.level;
  }

  /** Set the minimum level that gets written. */
  setLevel(level: LogLevel): void { this.level = level; }
  /** Current minimum level. */
  getLevel(): LogLevel { return this.level; }
  /** True when a record at `level` would be written under the current threshold. */
  isEnabled(level: LogLevel): boolean { return ORDER[level] >= ORDER[this.level]; }

  debug(component: string, message: string): void { this.log("debug", component, message); }
  info(component: string, message: string): void { this.log("info", component, message); }
  warn(component: string, message: string): void { this.log("warn", component, message); }
  error(component: string, message: string): void { this.log("error", component, message); }
  fatal(component: string, message: string): void { this.log("fatal", component, message); }

  /**
   * Write one record: `[ISO time] LEVEL [component] message`, with newlines in
   * the message collapsed so the record stays on a single line.
   */
  log(level: LogLevel, component: string, message: string): void {
    if (!this.isEnabled(level)) return;
    const line = `[${new Date().toISOString()}] ${level.toUpperCase().padEnd(5)} [${component}] ${oneLine(message)}\n`;
    try {
      const file = join(this.logDir, `session-mirror-${safeLogName(this.agentId)}.log`);
      try {
        // Fast path: the directory normally exists already, so skip the mkdir
        // syscall that used to run on every single record.
        appendFileSync(file, line);
      } catch {
        // First write, or the directory vanished: create it and retry once.
        mkdirSync(this.logDir, { recursive: true });
        appendFileSync(file, line);
      }
    } catch {
      // Best-effort. If the per-agent file can't be written, surface error/fatal
      // to stderr so the record is not lost entirely.
      if (ORDER[level] >= ORDER.error) { try { console.error(line.trimEnd()); } catch { /* give up */ } }
    }
  }
}
|
|
87
|
+
|
|
88
|
+
// Newlines would split one record across several log lines and break greppability;
|
|
89
|
+
// collapse them so a multi-line message stays one line.
|
|
90
|
+
/**
 * Collapse every line break in `message` into a visible " ⏎ " marker so one
 * record always occupies exactly one log line. Handles `\r\n`, `\n`, and a
 * bare `\r`.
 */
function oneLine(message: string): string {
  // `\r\n` is listed first so a CRLF pair yields a single marker, not two.
  return message.replace(/\r\n|\r|\n/g, " ⏎ ");
}
|
|
93
|
+
|
|
94
|
+
// Process-global logger. A runner process serves exactly one agent, so a singleton
|
|
95
|
+
// is the right scope; the runner calls configure() once it knows its id, and
|
|
96
|
+
// adapters import this instance directly (no constructor threading).
|
|
97
|
+
export const logger = new Logger();
|
package/src/outbox.ts
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
import { Database } from "bun:sqlite";
|
|
2
|
+
import { mkdirSync } from "node:fs";
|
|
3
|
+
import { dirname, join } from "node:path";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { logger } from "./logger";
|
|
6
|
+
|
|
7
|
+
// Phase 2 (#196) — the "nothing is ever lost" half. Runner→server events that used to be
|
|
8
|
+
// fire-and-forget over HTTP (session turns, reasoning/tool traces, prompt echoes, insights,
|
|
9
|
+
// hook-fatal reports) were silently dropped whenever the server was momentarily down. This
|
|
10
|
+
// is a durable, FIFO, disk-backed queue that:
|
|
11
|
+
// - survives Runner/server restart (bun:sqlite file in the runtime dir),
|
|
12
|
+
// - stamps true event time (`occurredAt`) once at enqueue and preserves it through retries,
|
|
13
|
+
// - retries with capped exponential backoff, strictly in order (an append log must not
|
|
14
|
+
// reorder turns),
|
|
15
|
+
// - poisons a permanently-failing head after maxAttempts so it can't block the queue,
|
|
16
|
+
// - is bounded with a logged drop policy (never silently truncates).
|
|
17
|
+
//
|
|
18
|
+
// Status deliberately does NOT go through here: it rides the WebSocket bus, which is
|
|
19
|
+
// last-wins and self-heals on reconnect (so it already satisfies "coalesce, don't replay
|
|
20
|
+
// stale busyes"). The coalesce mode below exists so a future state event could migrate here.
|
|
21
|
+
|
|
22
|
+
export type OutboxMode = "append" | "coalesce";
|
|
23
|
+
|
|
24
|
+
/** What a caller hands to `Outbox.enqueue`. */
export interface OutboxEventInput {
  /** Event type label; stored with the row and included in the default idempotency key. */
  kind: string;
  /** Event body. Serialized with `JSON.stringify` at enqueue; a nullish payload is stored as `null`. */
  payload: unknown;
  /** Queueing mode; "append" when omitted. */
  mode?: OutboxMode;
  /** Mandatory in coalesce mode: earlier un-poisoned rows carrying the same key are replaced. */
  dedupeKey?: string;
  /** Event time; the enqueue time when omitted. Pass it only to backdate (e.g. replaying a captured timestamp). */
  occurredAt?: number;
  /** Key for server-side dedup of retries; when omitted it is derived from agent id, kind, event time and a payload hash. */
  idempotencyKey?: string;
}
|
|
35
|
+
|
|
36
|
+
/** A queued event as presented to the transport (`OutboxSend`). */
export interface OutboxRecord {
  /** Queue sequence number assigned at enqueue; delivery order follows it. */
  seq: number;
  /** Event type label given at enqueue. */
  kind: string;
  /** Mode the event was enqueued with. */
  mode: OutboxMode;
  /** Event time stamped at enqueue and kept unchanged across retries. */
  occurredAt: number;
  /** Key the receiver can use to deduplicate retried deliveries. */
  idempotencyKey: string;
  /** Payload parsed back from its stored JSON (`null` if it cannot be parsed). */
  payload: unknown;
  /** Number of failed delivery attempts recorded before this one. */
  attempts: number;
}
|
|
45
|
+
|
|
46
|
+
// The transport. Resolve = delivered (row deleted). Reject = failed (retried with backoff).
|
|
47
|
+
export type OutboxSend = (record: OutboxRecord) => Promise<void>;
|
|
48
|
+
|
|
49
|
+
/** Construction options for `Outbox`. */
export interface OutboxOptions {
  /** Agent the queue belongs to; used in the database file name and default idempotency keys. */
  agentId: string;
  /** Transport invoked for each event: resolve means delivered, reject means retry. */
  send: OutboxSend;
  /**
   * Storage directory. Falls back to AGENT_RELAY_RUNNER_OUTBOX_DIR, then to a
   * directory under the OS temp dir. The literal ":memory:" selects an
   * in-memory database.
   */
  dir?: string;
  /** Maximum number of stored rows before the oldest are dropped. */
  maxRows?: number;
  /** Failed attempts after which an event is marked poisoned and skipped. */
  maxAttempts?: number;
  /** First retry delay, in milliseconds; doubles with each further failure. */
  baseBackoffMs?: number;
  /** Upper bound for the retry delay, in milliseconds. */
  maxBackoffMs?: number;
  /** Interval of the background poll that re-triggers a drain, in milliseconds. */
  pollMs?: number;
}
|
|
60
|
+
|
|
61
|
+
/** Fallbacks applied for any tuning knob left out of `OutboxOptions`. */
const DEFAULTS = {
  // Capacity of the queue, in rows.
  maxRows: 5_000,
  // Failed attempts before an event is poisoned.
  maxAttempts: 12,
  // Retry delay bounds, in milliseconds.
  baseBackoffMs: 1_000,
  maxBackoffMs: 60_000,
  // Background poll interval, in milliseconds.
  pollMs: 5_000,
};
|
|
68
|
+
|
|
69
|
+
/** Shape of the `outbox` table columns this module reads back from SQLite. */
interface Row {
  /** Auto-incrementing primary key; defines delivery order. */
  seq: number;
  kind: string;
  /** Stored as text; cast to `OutboxMode` when building a record. */
  mode: string;
  occurred_at: number;
  idempotency_key: string;
  /** JSON text of the event payload. */
  payload: string;
  /** Failed delivery attempts so far. */
  attempts: number;
  /** Epoch milliseconds before which the row must not be retried (0 = immediately). */
  next_attempt_at: number;
  /** 1 once the row exceeded the attempt limit, otherwise 0. */
  poisoned: number;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Durable, strictly-FIFO, SQLite-backed queue of runner→server events.
 *
 * Events are persisted at `enqueue` and delivered oldest-first through the
 * `send` transport. A failed head is retried with capped exponential backoff
 * and is never overtaken; after `maxAttempts` failures it is marked poisoned
 * so the queue can move on. The table is capped at `maxRows`, with drops logged.
 *
 * Background drains (enqueue, poll timer, backoff timer) never produce
 * unhandled rejections: failures are logged instead. Calling `close()` while a
 * send is in flight is safe — the in-flight event simply stays queued.
 */
export class Outbox {
  private readonly db: Database;
  private readonly agentId: string;
  private readonly send: OutboxSend;
  private readonly maxRows: number;
  private readonly maxAttempts: number;
  private readonly baseBackoffMs: number;
  private readonly maxBackoffMs: number;
  private readonly pollMs: number;
  /** Database location: a file path, or ":memory:". */
  readonly path: string;

  private draining = false;
  private rerun = false;
  private pollTimer?: ReturnType<typeof setInterval>;
  private dueTimer?: ReturnType<typeof setTimeout>;
  private stopped = false;
  // Set by close(); once true the database handle must not be touched again.
  private closed = false;

  /**
   * Open (creating if needed) the queue database for `options.agentId`.
   * @throws if the storage directory or database cannot be created/opened.
   */
  constructor(options: OutboxOptions) {
    this.agentId = options.agentId;
    this.send = options.send;
    this.maxRows = options.maxRows ?? DEFAULTS.maxRows;
    this.maxAttempts = options.maxAttempts ?? DEFAULTS.maxAttempts;
    this.baseBackoffMs = options.baseBackoffMs ?? DEFAULTS.baseBackoffMs;
    this.maxBackoffMs = options.maxBackoffMs ?? DEFAULTS.maxBackoffMs;
    this.pollMs = options.pollMs ?? DEFAULTS.pollMs;

    const dir = options.dir ?? process.env.AGENT_RELAY_RUNNER_OUTBOX_DIR ?? join(tmpdir(), "agent-relay-outbox");
    this.path = options.dir === ":memory:" ? ":memory:" : join(dir, `outbox-${safeName(this.agentId)}.sqlite`);
    if (this.path !== ":memory:") mkdirSync(dirname(this.path), { recursive: true });

    this.db = new Database(this.path, { create: true });
    this.db.exec("PRAGMA journal_mode = WAL");
    this.db.exec("PRAGMA busy_timeout = 2000");
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS outbox (
        seq INTEGER PRIMARY KEY AUTOINCREMENT,
        kind TEXT NOT NULL,
        mode TEXT NOT NULL DEFAULT 'append',
        dedupe_key TEXT,
        occurred_at INTEGER NOT NULL,
        idempotency_key TEXT NOT NULL,
        payload TEXT NOT NULL,
        attempts INTEGER NOT NULL DEFAULT 0,
        next_attempt_at INTEGER NOT NULL DEFAULT 0,
        poisoned INTEGER NOT NULL DEFAULT 0,
        created_at INTEGER NOT NULL
      )
    `);
    // A restart is a fresh start: clear any backoff deadlines left by the prior
    // process so pending events get an immediate retry (the down server may now
    // be back). `attempts` is kept so the poison threshold still counts
    // cumulative failures.
    this.db.exec("UPDATE outbox SET next_attempt_at = 0 WHERE next_attempt_at > 0");
  }

  /**
   * Persist an event and trigger a drain.
   * @returns the sequence number assigned to the event.
   * @throws if the outbox is stopped, or if coalesce mode is used without a dedupeKey.
   */
  enqueue(input: OutboxEventInput): number {
    if (this.stopped) throw new Error("outbox is stopped");
    const mode: OutboxMode = input.mode ?? "append";
    const occurredAt = input.occurredAt ?? Date.now();
    const payloadJson = JSON.stringify(input.payload ?? null);
    const idempotencyKey = input.idempotencyKey ?? `${this.agentId}:${input.kind}:${occurredAt}:${shortHash(payloadJson)}`;

    if (mode === "coalesce") {
      if (!input.dedupeKey) throw new Error("coalesce mode requires a dedupeKey");
      // Last-wins: the new event supersedes any still-deliverable one with the same key.
      this.db.query("DELETE FROM outbox WHERE dedupe_key = ? AND poisoned = 0").run(input.dedupeKey);
    }

    const info = this.db
      .query(`INSERT INTO outbox (kind, mode, dedupe_key, occurred_at, idempotency_key, payload, created_at)
              VALUES (?, ?, ?, ?, ?, ?, ?)`)
      .run(input.kind, mode, input.dedupeKey ?? null, occurredAt, idempotencyKey, payloadJson, Date.now());
    const seq = Number(info.lastInsertRowid);

    this.enforceBound();
    // Defer the drain to a microtask so a synchronous burst of enqueues (e.g.
    // several coalesce updates) all land — and coalesce — before the pump pulls
    // the head.
    queueMicrotask(() => { this.kick(); });
    return seq;
  }

  // Overflow policy: when the table exceeds maxRows, delete the excess —
  // already-poisoned rows first, then the oldest live rows — and log the drop.
  private enforceBound(): void {
    const { n } = this.db.query("SELECT count(*) AS n FROM outbox").get() as { n: number };
    if (n <= this.maxRows) return;
    const overflow = n - this.maxRows;
    const victims = this.db
      .query("SELECT seq FROM outbox ORDER BY poisoned DESC, seq ASC LIMIT ?")
      .all(overflow) as Array<{ seq: number }>;
    const ids = victims.map((v) => v.seq);
    if (ids.length === 0) return;
    const placeholders = ids.map(() => "?").join(",");
    this.db.query(`DELETE FROM outbox WHERE seq IN (${placeholders})`).run(...ids);
    logger.warn("outbox", `bound exceeded (${n}/${this.maxRows}) — dropped ${ids.length} oldest event(s)`);
  }

  /** Begin the background pump: an initial drain plus a poll timer as a backstop. Idempotent. */
  start(): void {
    if (this.pollTimer || this.stopped) return;
    this.kick();
    this.pollTimer = setInterval(() => { this.kick(); }, this.pollMs);
    this.pollTimer.unref?.();
  }

  /**
   * Process the queue strictly oldest-first. Concurrent calls are coalesced: a
   * call made while a drain is running returns immediately and causes one more
   * pass at the end, so an enqueue during a send isn't missed.
   *
   * Rejects if the database fails; callers that don't await should go through
   * the background triggers, which log such failures.
   */
  async drain(): Promise<void> {
    if (this.stopped) return;
    if (this.draining) { this.rerun = true; return; }
    this.draining = true;
    try {
      do {
        this.rerun = false;
        await this.drainOnce();
      } while (this.rerun && !this.stopped);
    } finally {
      this.draining = false;
    }
  }

  // Fire-and-forget drain for timers and enqueue. A rejected drain (e.g. a
  // SQLite error) is logged rather than surfacing as an unhandled rejection.
  private kick(): void {
    this.drain().catch((error: unknown) => {
      const reason = error instanceof Error ? error.message : String(error);
      logger.error("outbox", `drain failed: ${reason}`);
    });
  }

  // One pass over the queue: deliver heads until the queue is empty, the head
  // is in backoff, a send fails, or the outbox is stopped.
  private async drainOnce(): Promise<void> {
    for (;;) {
      if (this.stopped) return;
      const row = this.db
        .query("SELECT * FROM outbox WHERE poisoned = 0 ORDER BY seq ASC LIMIT 1")
        .get() as Row | null;
      if (!row) return;

      const now = Date.now();
      if (row.next_attempt_at > now) {
        // Head isn't due yet. Don't reorder past it (FIFO) — schedule a wake-up and stop.
        this.scheduleDue(row.next_attempt_at - now);
        return;
      }

      const record: OutboxRecord = {
        seq: row.seq,
        kind: row.kind,
        mode: row.mode as OutboxMode,
        occurredAt: row.occurred_at,
        idempotencyKey: row.idempotency_key,
        payload: safeParse(row.payload),
        attempts: row.attempts,
      };

      try {
        await this.send(record);
        // close() may have run while the send was in flight; the handle is gone,
        // so leave the row for the next process (the idempotency key covers the resend).
        if (this.closed) return;
        this.db.query("DELETE FROM outbox WHERE seq = ?").run(row.seq);
      } catch (error) {
        if (this.closed) return;
        const attempts = row.attempts + 1;
        const reason = error instanceof Error ? error.message : String(error);
        if (attempts >= this.maxAttempts) {
          this.db.query("UPDATE outbox SET attempts = ?, poisoned = 1 WHERE seq = ?").run(attempts, row.seq);
          logger.fatal("outbox", `event seq=${row.seq} kind=${row.kind} poisoned after ${attempts} attempts: ${reason}`);
          // Move on — the next iteration picks the new head (poison no longer blocks).
          continue;
        }
        const delay = this.backoff(attempts);
        // Measure the deadline from the moment of failure, not from before the
        // send: a slow failing send must not shorten (or cancel) the backoff.
        this.db.query("UPDATE outbox SET attempts = ?, next_attempt_at = ? WHERE seq = ?").run(attempts, Date.now() + delay, row.seq);
        logger.debug("outbox", `event seq=${row.seq} kind=${row.kind} retry ${attempts}/${this.maxAttempts} in ${delay}ms: ${reason}`);
        this.scheduleDue(delay);
        return; // head is now scheduled; stop until it's due (preserve order)
      }
    }
  }

  // Retry delay for the given failure count: exponential growth capped at
  // maxBackoffMs, with jitter drawn uniformly from the upper half [exp/2, exp].
  private backoff(attempts: number): number {
    const exp = Math.min(this.maxBackoffMs, this.baseBackoffMs * 2 ** (attempts - 1));
    return Math.round(exp / 2 + Math.random() * (exp / 2));
  }

  // Arm a one-shot wake-up for the head's retry. At most one is pending at a time.
  private scheduleDue(delayMs: number): void {
    if (this.stopped || this.dueTimer) return;
    this.dueTimer = setTimeout(() => {
      this.dueTimer = undefined;
      this.kick();
    }, Math.max(0, delayMs));
    this.dueTimer.unref?.();
  }

  /** Number of events still eligible for delivery (observability / tests). */
  pendingCount(): number {
    return (this.db.query("SELECT count(*) AS n FROM outbox WHERE poisoned = 0").get() as { n: number }).n;
  }

  /** Number of events given up on after reaching the attempt limit. */
  poisonedCount(): number {
    return (this.db.query("SELECT count(*) AS n FROM outbox WHERE poisoned = 1").get() as { n: number }).n;
  }

  /** Halt the pump and cancel timers. The database stays open; further enqueues throw. */
  stop(): void {
    this.stopped = true;
    if (this.pollTimer) clearInterval(this.pollTimer);
    this.pollTimer = undefined;
    if (this.dueTimer) clearTimeout(this.dueTimer);
    this.dueTimer = undefined;
  }

  /** Stop the pump and release the database handle. Safe to call more than once. */
  close(): void {
    this.stop();
    this.closed = true;
    try { this.db.close(); } catch { /* already closed */ }
  }
}
|
|
285
|
+
|
|
286
|
+
// Reduce an arbitrary string to a conservative name: every run of characters
// outside [a-zA-Z0-9_.-] becomes a single "_", the result is capped at 180
// characters, and an empty result falls back to "agent".
function safeName(value: string): string {
  const sanitized = value.replace(/[^a-zA-Z0-9_.-]+/g, "_");
  const truncated = sanitized.slice(0, 180);
  return truncated.length > 0 ? truncated : "agent";
}
|
|
289
|
+
|
|
290
|
+
// JSON.parse that never throws: malformed input yields null instead of an error.
function safeParse(json: string): unknown {
  let parsed: unknown = null;
  try {
    parsed = JSON.parse(json);
  } catch {
    // Malformed JSON: keep the null default.
  }
  return parsed;
}
|
|
293
|
+
|
|
294
|
+
// Small, fast, stable string hash (FNV-1a, 32-bit) used to disambiguate identical
// kind+timestamp payloads in the idempotency key. Not security-sensitive.
// Each UTF-16 code unit of `value` is folded in; the unsigned 32-bit result is
// rendered in base 36.
function shortHash(value: string): string {
  const offsetBasis = 0x811c9dc5;
  const prime = 0x01000193;
  let hash = offsetBasis;
  let index = 0;
  while (index < value.length) {
    // XOR the code unit in, then multiply by the prime with 32-bit wraparound.
    hash = Math.imul(hash ^ value.charCodeAt(index), prime);
    index += 1;
  }
  return (hash >>> 0).toString(36);
}
|
|
@@ -45,15 +45,29 @@ export function workspaceDepsNote(input: { mode?: string | null; depsMode?: stri
|
|
|
45
45
|
}
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
/**
|
|
48
|
+
/**
|
|
49
|
+
* Caveat for untracked paths symlinked from main into an isolated worktree
|
|
50
|
+
* (WorkspaceConfig.symlinkPaths, e.g. AGENTS.md, .claude-rig). Edits to these
|
|
51
|
+
* write THROUGH to the main checkout — the agent must know so it doesn't mutate
|
|
52
|
+
* shared config thinking it's worktree-local. Returns "" when nothing was linked.
|
|
53
|
+
*/
|
|
54
|
+
export function workspaceSymlinksNote(linked: string[]): string {
  // Nothing was linked, so there is no caveat to surface.
  if (!linked.length) return "";
  const paths = linked.join(", ");
  return `[agent-relay] Isolated workspace: these untracked paths are SYMLINKED from the main checkout: ${paths}. They resolve to the real files in main, so editing or deleting them writes THROUGH to main — treat them as read-only unless you intend to change main.`;
}
|
|
58
|
+
|
|
59
|
+
/** Resolve the workspace caveats from the runner/monitor environment.
|
|
49
60
|
* AGENT_RELAY_WORKSPACE_JSON carries the resolved workspace metadata (mode +
|
|
50
|
-
* deps) and is the authoritative source. Best-effort: never throws. */
|
|
61
|
+
* deps + symlinks) and is the authoritative source. Best-effort: never throws. */
|
|
51
62
|
export function workspaceDepsNoteFromEnv(env: Record<string, string | undefined> = process.env): string {
|
|
52
63
|
const json = env.AGENT_RELAY_WORKSPACE_JSON;
|
|
53
64
|
if (!json) return "";
|
|
54
65
|
try {
|
|
55
|
-
const parsed = JSON.parse(json) as { mode?: string; deps?: { mode?: string } };
|
|
56
|
-
return
|
|
66
|
+
const parsed = JSON.parse(json) as { mode?: string; deps?: { mode?: string }; symlinks?: { linked?: string[] } };
|
|
67
|
+
return [
|
|
68
|
+
workspaceDepsNote({ mode: parsed.mode ?? null, depsMode: parsed.deps?.mode ?? null }),
|
|
69
|
+
parsed.mode === "isolated" ? workspaceSymlinksNote(parsed.symlinks?.linked ?? []) : "",
|
|
70
|
+
].filter(Boolean).join("\n\n");
|
|
57
71
|
} catch {
|
|
58
72
|
return "";
|
|
59
73
|
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import type { ReplyObligation } from "agent-relay-sdk";
|
|
2
|
+
import { logger } from "./logger";
|
|
3
|
+
|
|
4
|
+
// Phase 2 (#196) — the crux. The Claude Stop hook used to ask the server, synchronously
|
|
5
|
+
// and in the hot path, "does this agent owe a reply?" before clearing the turn. A slow
|
|
6
|
+
// server answer (the unindexed reply_to scan, #199) blew the hook's timeout and wedged the
|
|
7
|
+
// agent in `busy` forever. The fix: the hook asks the Runner, the Runner answers instantly
|
|
8
|
+
// from this local snapshot, and the snapshot is refreshed from the server only in the
|
|
9
|
+
// background — never on the path that ends a turn.
|
|
10
|
+
//
|
|
11
|
+
// Design rules:
|
|
12
|
+
// - `get()` is synchronous, never throws, never touches the network.
|
|
13
|
+
// - `refresh()` is the only thing that talks to the server; it coalesces concurrent calls
|
|
14
|
+
// and, on failure, keeps the last-known snapshot (stale-but-serving beats blocking).
|
|
15
|
+
// - A background interval keeps the snapshot warm; `markDirty()` requests an extra,
|
|
16
|
+
// debounced refresh when state likely just changed (a message arrived, a turn ended).
|
|
17
|
+
|
|
18
|
+
export type ReplyObligationFetch = () => Promise<ReplyObligation[]>;
|
|
19
|
+
|
|
20
|
+
export interface ReplyObligationCacheOptions {
|
|
21
|
+
fetch: ReplyObligationFetch;
|
|
22
|
+
// Background freshness backstop. Default 10s — well under any turn cadence, cheap.
|
|
23
|
+
intervalMs?: number;
|
|
24
|
+
// Debounce window for markDirty()-triggered refreshes so a burst of events
|
|
25
|
+
// (e.g. a fan-out of messages) collapses into one server round-trip.
|
|
26
|
+
dirtyDebounceMs?: number;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
const DEFAULT_INTERVAL_MS = 10_000;
|
|
30
|
+
const DEFAULT_DIRTY_DEBOUNCE_MS = 400;
|
|
31
|
+
|
|
32
|
+
/**
 * Local snapshot of the reply obligations returned by the injected `fetch`.
 *
 * - `get()` is synchronous, never throws and never touches the network.
 * - `refresh()` is the only path that calls `fetch`; concurrent callers share one
 *   in-flight request, and a failed fetch keeps the previous snapshot.
 * - `start()` primes the snapshot and keeps it warm on an interval; `markDirty()`
 *   requests an extra, debounced refresh when state likely just changed.
 */
export class ReplyObligationCache {
  private readonly fetch: ReplyObligationFetch;
  private readonly intervalMs: number;
  private readonly dirtyDebounceMs: number;

  private snapshot: ReplyObligation[] = [];
  private lastRefreshedAt = 0;
  private inFlight: Promise<void> | null = null;
  // True when a markDirty()-triggered refresh came due while a fetch was already in
  // flight. That fetch may have been answered before the change happened, so one
  // trailing refresh is run as soon as it settles.
  private dirtyWhileInFlight = false;
  private intervalTimer?: ReturnType<typeof setInterval>;
  private dirtyTimer?: ReturnType<typeof setTimeout>;
  private stopped = false;

  constructor(options: ReplyObligationCacheOptions) {
    this.fetch = options.fetch;
    this.intervalMs = options.intervalMs ?? DEFAULT_INTERVAL_MS;
    this.dirtyDebounceMs = options.dirtyDebounceMs ?? DEFAULT_DIRTY_DEBOUNCE_MS;
  }

  // Synchronous, hot-path-safe read. Returns a copy so callers can't mutate the snapshot.
  get(): ReplyObligation[] {
    return this.snapshot.slice();
  }

  // Epoch milliseconds of the last successful refresh; 0 until one has succeeded.
  getLastRefreshedAt(): number {
    return this.lastRefreshedAt;
  }

  // Begin the background freshness loop and prime the first snapshot immediately.
  // No-op when already started or after stop().
  start(): void {
    if (this.intervalTimer || this.stopped) return;
    void this.refresh();
    this.intervalTimer = setInterval(() => { void this.refresh(); }, this.intervalMs);
    // Don't keep the process alive solely for cache refreshes.
    this.intervalTimer.unref?.();
  }

  // Cancel all timers and make every later refresh()/markDirty()/start() a no-op.
  // The last snapshot stays readable through get().
  stop(): void {
    this.stopped = true;
    this.dirtyWhileInFlight = false;
    if (this.intervalTimer) clearInterval(this.intervalTimer);
    this.intervalTimer = undefined;
    if (this.dirtyTimer) clearTimeout(this.dirtyTimer);
    this.dirtyTimer = undefined;
  }

  // Request a refresh because state likely changed (message arrived / turn ended).
  // Debounced so a burst collapses into a single server round-trip.
  markDirty(): void {
    if (this.stopped || this.dirtyTimer) return;
    this.dirtyTimer = setTimeout(() => {
      this.dirtyTimer = undefined;
      if (this.inFlight) {
        // Joining the in-flight request would silently drop this dirty signal: that
        // request may predate the change. Remember it and refresh once more afterwards.
        this.dirtyWhileInFlight = true;
        return;
      }
      void this.refresh();
    }, this.dirtyDebounceMs);
    this.dirtyTimer.unref?.();
  }

  // Fetch from the server and replace the snapshot. Coalesces concurrent callers onto a
  // single in-flight request. Never rejects — a failed fetch leaves the prior snapshot in
  // place (the hook keeps getting an answer even while the server is down).
  refresh(): Promise<void> {
    if (this.stopped) return Promise.resolve();
    if (this.inFlight) return this.inFlight;
    this.inFlight = this.doRefresh().finally(() => {
      this.inFlight = null;
      // A dirty signal arrived mid-flight: run exactly one trailing refresh so the
      // snapshot reflects the change without waiting for the next interval tick.
      if (this.dirtyWhileInFlight && !this.stopped) {
        this.dirtyWhileInFlight = false;
        void this.refresh();
      }
    });
    return this.inFlight;
  }

  // One fetch attempt. Swallows every error so refresh() can never reject.
  private async doRefresh(): Promise<void> {
    try {
      const obligations = await this.fetch();
      // stop() may have been called while the fetch was pending; leave the snapshot alone.
      if (this.stopped) return;
      this.snapshot = Array.isArray(obligations) ? obligations : [];
      this.lastRefreshedAt = Date.now();
    } catch (error) {
      // Server-down is a non-event: keep serving the last snapshot. Debug, not error —
      // this is expected during outages and must not spam the log.
      logger.debug("obligation-cache", `refresh failed, serving cached snapshot (${this.snapshot.length}): ${error instanceof Error ? error.message : String(error)}`);
    }
  }
}
|
package/src/runner.ts
CHANGED
|
@@ -1,18 +1,21 @@
|
|
|
1
1
|
import { hostname } from "node:os";
|
|
2
|
-
import {
|
|
2
|
+
import { closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join } from "node:path";
|
|
5
|
-
import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
|
|
5
|
+
import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, SendMessageInput, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
|
|
6
6
|
import { RelayBusClient, RelayHttpClient } from "agent-relay-sdk";
|
|
7
7
|
import { contextStateFromProbeMetrics, readContextProbeState } from "agent-relay-sdk/context-probe";
|
|
8
8
|
import type { ManagedProcess, ProviderAdapter, ProviderConfig, ProviderPermissionDecision, ProviderPermissionDecisionInput, ProviderSessionEvent, ProviderStatusUpdate, RunnerSpawnConfig, SemanticStatus, TerminalAttachSpec } from "./adapter";
|
|
9
9
|
import { messagesWithCachedAttachments } from "./attachment-cache";
|
|
10
10
|
import { ClaimTracker } from "./claim-tracker";
|
|
11
11
|
import { startControlServer, type ControlServer } from "./control-server";
|
|
12
|
+
import { ReplyObligationCache } from "./reply-obligation-cache";
|
|
13
|
+
import { Outbox, type OutboxRecord } from "./outbox";
|
|
12
14
|
import { extractLastAssistantTurn, extractFinalAssistantMessage, extractHookAssistantMessage, extractLatestTurnSteps, transcriptLooksComplete, analyzeSession } from "./adapters/claude-transcript";
|
|
13
15
|
import { agentProfileProjectionReport } from "./profile-projection";
|
|
14
16
|
import { profileUsesHostProviderGlobals } from "./profile-home";
|
|
15
17
|
import { runtimeMetadata } from "./version";
|
|
18
|
+
import { logger, parseLogLevel } from "./logger";
|
|
16
19
|
import { ensureSessionScratch, reapSessionScratch, sweepStaleSessions, type SessionScratchLayout } from "./session-scratch";
|
|
17
20
|
|
|
18
21
|
interface RunnerOptions {
|
|
@@ -115,6 +118,13 @@ export class AgentRunner {
|
|
|
115
118
|
private readonly claims = new ClaimTracker();
|
|
116
119
|
private readonly http: RelayHttpClient;
|
|
117
120
|
private readonly bus: RelayBusClient;
|
|
121
|
+
// Phase 2 (#196): the Stop hook reads reply obligations from this local snapshot, never
|
|
122
|
+
// from the server — so a slow server can no longer wedge a turn (the crux fix).
|
|
123
|
+
private readonly obligationCache: ReplyObligationCache;
|
|
124
|
+
// Phase 2 (#196): Runner→server append-log events (session turns, reasoning, prompts,
|
|
125
|
+
// insights, hook-fatal) go through this durable, disk-backed, timestamped queue instead of
|
|
126
|
+
// direct fire-and-forget HTTP — so nothing is lost across a server/Runner restart.
|
|
127
|
+
private readonly outbox: Outbox;
|
|
118
128
|
private currentToken?: string;
|
|
119
129
|
private currentTokenJti?: string;
|
|
120
130
|
private currentTokenProfileId?: string;
|
|
@@ -177,12 +187,26 @@ export class AgentRunner {
|
|
|
177
187
|
|
|
178
188
|
constructor(private readonly options: RunnerOptions) {
|
|
179
189
|
this.agentId = options.agentId ?? options.runnerId;
|
|
190
|
+
// Bind the process-global logger to this agent. AGENT_RELAY_SESSION_DEBUG=1 is
|
|
191
|
+
// kept as a back-compat alias for the verbose probe/emit lines, now expressed
|
|
192
|
+
// as log level "debug" (AGENT_RELAY_LOG_LEVEL still wins when both are set).
|
|
193
|
+
logger.configure({
|
|
194
|
+
agentId: this.agentId,
|
|
195
|
+
headless: options.headless,
|
|
196
|
+
...(this.sessionDebugVerbose && !parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ? { level: "debug" as const } : {}),
|
|
197
|
+
});
|
|
180
198
|
this.currentToken = options.token;
|
|
181
199
|
this.currentTokenJti = options.tokenJti;
|
|
182
200
|
this.currentTokenProfileId = options.tokenProfileId;
|
|
183
201
|
this.currentTokenExpiresAt = options.tokenExpiresAt;
|
|
184
202
|
const runtime = runtimeMetadata(options.provider);
|
|
185
203
|
this.http = new RelayHttpClient({ baseUrl: options.relayUrl, token: this.currentToken });
|
|
204
|
+
this.obligationCache = new ReplyObligationCache({ fetch: () => this.http.listReplyObligations(this.agentId) });
|
|
205
|
+
// Co-locate the durable outbox with the runner's runtime state (survives reboot) when the
|
|
206
|
+
// orchestrator told us where that is; otherwise the Outbox falls back to a temp dir.
|
|
207
|
+
const outboxDir = process.env.AGENT_RELAY_RUNNER_OUTBOX_DIR
|
|
208
|
+
?? (process.env.AGENT_RELAY_RUNNER_INFO_FILE ? join(dirname(process.env.AGENT_RELAY_RUNNER_INFO_FILE), "outbox") : undefined);
|
|
209
|
+
this.outbox = new Outbox({ agentId: this.agentId, dir: outboxDir, send: (record) => this.deliverOutboxEvent(record) });
|
|
186
210
|
this.bus = new RelayBusClient({
|
|
187
211
|
url: relayBusUrl(options.relayUrl),
|
|
188
212
|
role: "provider",
|
|
@@ -251,10 +275,13 @@ export class AgentRunner {
|
|
|
251
275
|
this.control = startControlServer({
|
|
252
276
|
onStatus: (status) => this.setProviderStatus(status),
|
|
253
277
|
onTerminalAttachSpec: () => this.terminalAttachSpec(),
|
|
254
|
-
|
|
278
|
+
// Hot-path-safe: answered instantly from the local snapshot, never a server
|
|
279
|
+
// round-trip. The snapshot is kept warm by the background refresh below (#196).
|
|
280
|
+
onReplyObligations: () => Promise.resolve(this.obligationCache.get()),
|
|
255
281
|
onSessionTurn: (input) => this.publishSessionTurn(input),
|
|
256
282
|
onUserPrompt: (input) => this.handleUserPrompt(input),
|
|
257
283
|
onSessionEnd: (input) => this.handleSessionEnd(input),
|
|
284
|
+
onHookFatal: (report) => this.reportHookFatal(report),
|
|
258
285
|
});
|
|
259
286
|
this.writeRunnerInfoFile();
|
|
260
287
|
this.options.adapter.onStatusChange((status) => {
|
|
@@ -268,12 +295,19 @@ export class AgentRunner {
|
|
|
268
295
|
if (runnerShouldResolveProviderExit(semanticStatus, this.exitCommandInProgress)) this.options.onProviderExit?.(semanticStatus === "offline" ? 0 : 1);
|
|
269
296
|
});
|
|
270
297
|
this.options.adapter.onSessionEvent?.((event) => { void this.publishProviderSessionEvent(event); });
|
|
271
|
-
this.bus.on("message.new", (message) =>
|
|
298
|
+
this.bus.on("message.new", (message) => {
|
|
299
|
+
// A delivered message may create a new reply obligation — warm the snapshot so the
|
|
300
|
+
// next turn-end sees it without a hot-path server read.
|
|
301
|
+
this.obligationCache.markDirty();
|
|
302
|
+
this.enqueueMessage(message as Message);
|
|
303
|
+
});
|
|
272
304
|
this.bus.on("command", (type, params, commandId, command) => {
|
|
273
305
|
void this.handleCommand(type, params, commandId, command);
|
|
274
306
|
});
|
|
275
307
|
this.bus.on("error", (code, message) => this.handleBusError(String(code), String(message)));
|
|
276
308
|
await this.bus.connect();
|
|
309
|
+
this.obligationCache.start();
|
|
310
|
+
this.outbox.start();
|
|
277
311
|
this.ensureScratch();
|
|
278
312
|
void this.sweepStaleScratch();
|
|
279
313
|
this.process = await this.spawnProvider();
|
|
@@ -313,6 +347,8 @@ export class AgentRunner {
|
|
|
313
347
|
this.tokenRenewTimer = undefined;
|
|
314
348
|
this.disarmBusyReconciler();
|
|
315
349
|
this.stopReasoningTail();
|
|
350
|
+
this.obligationCache.stop();
|
|
351
|
+
this.outbox.close();
|
|
316
352
|
this.control?.stop();
|
|
317
353
|
await this.bus.close();
|
|
318
354
|
}
|
|
@@ -387,7 +423,7 @@ export class AgentRunner {
|
|
|
387
423
|
startedAt: this.options.startedAt,
|
|
388
424
|
}, null, 2) + "\n", { mode: 0o600 });
|
|
389
425
|
} catch (error) {
|
|
390
|
-
|
|
426
|
+
logger.error("runner", `failed to write runner info file: ${error}`);
|
|
391
427
|
}
|
|
392
428
|
}
|
|
393
429
|
|
|
@@ -403,7 +439,7 @@ export class AgentRunner {
|
|
|
403
439
|
const messages = await this.http.pollMessages({ for: this.agentId, unread: true, limit: 100 });
|
|
404
440
|
for (const message of messages) this.enqueueMessage(message);
|
|
405
441
|
} catch (error) {
|
|
406
|
-
|
|
442
|
+
logger.error("runner", `inbox bootstrap failed: ${error}`);
|
|
407
443
|
}
|
|
408
444
|
}
|
|
409
445
|
|
|
@@ -413,7 +449,7 @@ export class AgentRunner {
|
|
|
413
449
|
try {
|
|
414
450
|
await this.options.adapter.deliverInitialPrompt(this.process, prompt);
|
|
415
451
|
} catch (error) {
|
|
416
|
-
|
|
452
|
+
logger.error("runner", `initial prompt delivery failed: ${error}`);
|
|
417
453
|
}
|
|
418
454
|
}
|
|
419
455
|
|
|
@@ -450,7 +486,7 @@ export class AgentRunner {
|
|
|
450
486
|
status: "in_progress",
|
|
451
487
|
agentId: this.agentId,
|
|
452
488
|
metadata: { messageId: message.id, completedBy: "runner" },
|
|
453
|
-
}).catch((error) =>
|
|
489
|
+
}).catch((error) => logger.error("task", `task ${taskId} in_progress update failed: ${error}`));
|
|
454
490
|
// Runner owns claim + status here; drop the server's self-claim instruction
|
|
455
491
|
// so the agent doesn't improvise a stray claim send (see stripRunnerClaimedGuidance).
|
|
456
492
|
toDeliver = { ...message, body: stripRunnerClaimedGuidance(message.body) };
|
|
@@ -468,7 +504,7 @@ export class AgentRunner {
|
|
|
468
504
|
try {
|
|
469
505
|
const prepared = await messagesWithCachedAttachments(deliverable, this.http, {
|
|
470
506
|
agentId: this.agentId,
|
|
471
|
-
onError: (message) =>
|
|
507
|
+
onError: (message) => logger.error("runner", message),
|
|
472
508
|
});
|
|
473
509
|
await this.options.adapter.deliver(this.process, prepared);
|
|
474
510
|
for (const message of deliverable) {
|
|
@@ -477,7 +513,7 @@ export class AgentRunner {
|
|
|
477
513
|
}
|
|
478
514
|
} catch (error) {
|
|
479
515
|
failed = true;
|
|
480
|
-
if (shouldLogDeliveryFailure(error))
|
|
516
|
+
if (shouldLogDeliveryFailure(error)) logger.warn("delivery", `message delivery failed: ${error}`);
|
|
481
517
|
for (const message of deliverable) {
|
|
482
518
|
this.clearActiveClaim(message);
|
|
483
519
|
this.pendingMessages.set(message.id, message);
|
|
@@ -545,7 +581,7 @@ export class AgentRunner {
|
|
|
545
581
|
await this.http.deleteAgent(this.agentId).catch(() => {});
|
|
546
582
|
if (this.options.exitProcessOnShutdown !== false) {
|
|
547
583
|
setTimeout(() => void this.stop().catch((error) => {
|
|
548
|
-
|
|
584
|
+
logger.error("lifecycle", `stop after command failed: ${error}`);
|
|
549
585
|
}).finally(() => process.exit(0)), 10);
|
|
550
586
|
}
|
|
551
587
|
} else if (!this.stopped) {
|
|
@@ -680,7 +716,7 @@ export class AgentRunner {
|
|
|
680
716
|
|
|
681
717
|
if (this.shouldStopUnexpectedProviderExit(diagnostics)) {
|
|
682
718
|
const hasResumeId = typeof diagnostics.claudeResumeId === "string" && diagnostics.claudeResumeId.length > 0;
|
|
683
|
-
|
|
719
|
+
logger.warn("lifecycle", `${this.options.provider} exited; leaving agent offline for manual recovery`);
|
|
684
720
|
this.publishRunnerTimelineEvent({
|
|
685
721
|
status: "provider.restart_decision",
|
|
686
722
|
id: `provider-restart-decision-${this.providerSessionId}-${now}`,
|
|
@@ -708,7 +744,7 @@ export class AgentRunner {
|
|
|
708
744
|
}
|
|
709
745
|
|
|
710
746
|
if (runtimeMs < RAPID_EXIT_MS && recent.length > MAX_RAPID_UNEXPECTED_EXITS) {
|
|
711
|
-
|
|
747
|
+
logger.error("lifecycle", `provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
|
|
712
748
|
this.publishRunnerTimelineEvent({
|
|
713
749
|
status: "provider.restart_decision",
|
|
714
750
|
id: `provider-restart-decision-${this.providerSessionId}-${now}`,
|
|
@@ -732,7 +768,7 @@ export class AgentRunner {
|
|
|
732
768
|
}
|
|
733
769
|
|
|
734
770
|
const delayMs = Math.min(10_000, Math.max(500, 500 * recent.length));
|
|
735
|
-
|
|
771
|
+
logger.warn("lifecycle", `provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
|
|
736
772
|
this.publishRunnerTimelineEvent({
|
|
737
773
|
status: "provider.restart_decision",
|
|
738
774
|
id: `provider-restart-decision-${this.providerSessionId}-${now}`,
|
|
@@ -757,7 +793,7 @@ export class AgentRunner {
|
|
|
757
793
|
this.publishStatus();
|
|
758
794
|
this.scheduleDrain();
|
|
759
795
|
} catch (error) {
|
|
760
|
-
|
|
796
|
+
logger.error("lifecycle", `provider restart after unexpected exit failed: ${error}`);
|
|
761
797
|
this.setProviderStatus("error");
|
|
762
798
|
this.options.onProviderExit?.(1);
|
|
763
799
|
}
|
|
@@ -832,10 +868,10 @@ export class AgentRunner {
|
|
|
832
868
|
private handleBusError(code: string, message: string): void {
|
|
833
869
|
const action = runnerBusErrorAction(code, this.stopped);
|
|
834
870
|
if (action === "ignore") return;
|
|
835
|
-
|
|
871
|
+
logger.error("bus", `bus error ${code}: ${message}`);
|
|
836
872
|
if (action === "stop") {
|
|
837
873
|
void this.stop().catch((error) => {
|
|
838
|
-
|
|
874
|
+
logger.error("bus", `stop after bus error failed: ${error}`);
|
|
839
875
|
}).finally(() => process.exit(0));
|
|
840
876
|
}
|
|
841
877
|
}
|
|
@@ -918,13 +954,10 @@ export class AgentRunner {
|
|
|
918
954
|
replyToMessageId = pendingPrompt;
|
|
919
955
|
this.pendingPromptMessageId = undefined;
|
|
920
956
|
} else {
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
} catch {
|
|
926
|
-
// fall through and capture without correlation
|
|
927
|
-
}
|
|
957
|
+
// Correlation-only (threading + obligation clearing) — the local snapshot is fresh
|
|
958
|
+
// enough and never blocks the response-capture path (#196).
|
|
959
|
+
const obligation = [...this.obligationCache.get()].reverse().find((o) => o.from === "user");
|
|
960
|
+
replyToMessageId = obligation?.messageId;
|
|
928
961
|
}
|
|
929
962
|
|
|
930
963
|
// The Stop hook can fire before the final assistant entry is flushed to disk.
|
|
@@ -966,31 +999,86 @@ export class AgentRunner {
|
|
|
966
999
|
...(replyToMessageId ? { replyTo: replyToMessageId } : {}),
|
|
967
1000
|
session: { type: "response", origin: "provider", ...(turnId ? { turnId } : {}) },
|
|
968
1001
|
});
|
|
1002
|
+
// The agent's reply may have cleared an obligation — refresh the snapshot so the next
|
|
1003
|
+
// turn-end doesn't re-prompt for a message already answered (#196).
|
|
1004
|
+
if (replyToMessageId) this.obligationCache.markDirty();
|
|
969
1005
|
}
|
|
970
1006
|
|
|
971
1007
|
// Post one session-mirror event (prompt echo, assistant response, reasoning or
|
|
972
1008
|
// tool step) as a `kind: "session"` relay message tagged with payload.session so
|
|
973
1009
|
// the dashboard can render the live provider session faithfully. Display-only:
|
|
974
1010
|
// session messages are never delivered back into a provider.
|
|
975
|
-
private
|
|
1011
|
+
private publishSessionEvent(input: {
|
|
976
1012
|
from: string;
|
|
977
1013
|
to: string;
|
|
978
1014
|
body: string;
|
|
979
1015
|
session: MessageSessionMeta;
|
|
980
1016
|
replyTo?: number;
|
|
981
|
-
}):
|
|
982
|
-
|
|
983
|
-
|
|
1017
|
+
}): void {
|
|
1018
|
+
// Durable, ordered, timestamped (#196): the actual POST happens in deliverOutboxEvent,
|
|
1019
|
+
// retried until it lands. occurredAt is stamped now so a queued event reports when it
|
|
1020
|
+
// truly happened, not when the server finally accepted it.
|
|
1021
|
+
this.outbox.enqueue({
|
|
1022
|
+
kind: "session-message",
|
|
1023
|
+
payload: {
|
|
984
1024
|
from: input.from,
|
|
985
1025
|
to: input.to,
|
|
986
1026
|
...(input.replyTo ? { replyTo: input.replyTo } : {}),
|
|
987
1027
|
kind: "session",
|
|
988
1028
|
body: input.body,
|
|
989
1029
|
payload: { session: { provider: this.options.provider, ...input.session } },
|
|
1030
|
+
} satisfies SendMessageInput,
|
|
1031
|
+
});
|
|
1032
|
+
}
|
|
1033
|
+
|
|
1034
|
+
// The outbox transport: map a queued record to its HTTP call. Throw to retry, return to
|
|
1035
|
+
// ack (delete). occurredAt + idempotencyKey are injected from the record so retries are
|
|
1036
|
+
// exactly-once server-side and carry true event time.
|
|
1037
|
+
// Outbox transport: translate one queued record into its HTTP call. Returning acks
// the record; throwing hands it back to the outbox for a retry. The record's
// occurredAt (and, for session messages, its idempotencyKey) is added to the request.
private async deliverOutboxEvent(record: OutboxRecord): Promise<void> {
  try {
    switch (record.kind) {
      case "session-message": {
        const message = record.payload as SendMessageInput;
        await this.http.sendMessage({
          ...message,
          occurredAt: record.occurredAt,
          idempotencyKey: record.idempotencyKey,
        });
        return;
      }
      case "insight": {
        const observation = record.payload as Parameters<RelayHttpClient["recordInsightObservation"]>[0];
        await this.http.recordInsightObservation({
          ...observation,
          occurredAt: record.occurredAt,
        });
        return;
      }
      default:
        // Unknown kinds are acked (dropped) rather than retried.
        logger.warn("outbox", `dropping event with unknown kind: ${record.kind}`);
    }
  } catch (error) {
    // 409 means the server intentionally rejected the event (e.g. the feature is
    // toggled off): a permanent refusal, so ack instead of retrying.
    if (isHttpStatusError(error, 409)) return;
    // On an auth failure kick off token recovery, then still rethrow so the outbox retries.
    if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("outbox");
    throw error;
  }
}
|
|
1063
|
+
|
|
1064
|
+
// A hook reported an unhandled failure (#198 seam). Already logged FATAL by the control
|
|
1065
|
+
// server; here we additionally surface it durably to the server as a generic insight so
|
|
1066
|
+
// it shows up in observability rather than only in the per-agent log (#196).
|
|
1067
|
+
private reportHookFatal(report: { hook: string; error: string }): void {
|
|
1068
|
+
try {
|
|
1069
|
+
this.outbox.enqueue({
|
|
1070
|
+
kind: "insight",
|
|
1071
|
+
payload: {
|
|
1072
|
+
sessionId: this.providerSessionId,
|
|
1073
|
+
project: this.options.cwd,
|
|
1074
|
+
agentId: this.agentId,
|
|
1075
|
+
signal: "hook_fatal",
|
|
1076
|
+
value: { hook: report.hook, error: report.error },
|
|
1077
|
+
source: "server",
|
|
1078
|
+
},
|
|
990
1079
|
});
|
|
991
1080
|
} catch (error) {
|
|
992
|
-
|
|
993
|
-
if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("session-capture");
|
|
1081
|
+
logger.error("outbox", `failed to queue hook-fatal report: ${error instanceof Error ? error.message : String(error)}`);
|
|
994
1082
|
}
|
|
995
1083
|
}
|
|
996
1084
|
|
|
@@ -1034,8 +1122,11 @@ export class AgentRunner {
|
|
|
1034
1122
|
}
|
|
1035
1123
|
const analysis = analyzeSession(jsonl);
|
|
1036
1124
|
if (!analysis) return; // no tool calls = nothing substantive to measure
|
|
1037
|
-
|
|
1038
|
-
|
|
1125
|
+
// Durable + non-blocking (#196): queue it. SessionEnd can race provider shutdown, so a
|
|
1126
|
+
// direct POST risked being dropped if the server hiccuped; the outbox survives that.
|
|
1127
|
+
this.outbox.enqueue({
|
|
1128
|
+
kind: "insight",
|
|
1129
|
+
payload: {
|
|
1039
1130
|
sessionId: this.providerSessionId,
|
|
1040
1131
|
project: this.options.cwd,
|
|
1041
1132
|
agentId: this.agentId,
|
|
@@ -1043,13 +1134,9 @@ export class AgentRunner {
|
|
|
1043
1134
|
value: { ...analysis.metric, ...(input.reason ? { endReason: input.reason } : {}) },
|
|
1044
1135
|
outcome: { ...analysis.outcome },
|
|
1045
1136
|
source: "server",
|
|
1046
|
-
}
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
// 409 = Insights/feature toggled off; anything else is best-effort too.
|
|
1050
|
-
this.sessionDebug(`insights context_ratio skipped: ${error instanceof Error ? error.message : String(error)}`);
|
|
1051
|
-
if (isHttpAuthError(error)) this.recoverRuntimeTokenAfterAuthFailure("insights");
|
|
1052
|
-
}
|
|
1137
|
+
},
|
|
1138
|
+
});
|
|
1139
|
+
this.sessionLog(`insights: context_ratio ${analysis.metric.ratio.toFixed(2)} (${analysis.metric.gatheringCalls}/${analysis.metric.totalToolCalls} gathering) queued`);
|
|
1053
1140
|
}
|
|
1054
1141
|
|
|
1055
1142
|
// Route a provider-emitted session event (Codex app-server) into the chat mirror.
|
|
@@ -1078,13 +1165,9 @@ export class AgentRunner {
|
|
|
1078
1165
|
if (pendingPrompt) {
|
|
1079
1166
|
replyToMessageId = pendingPrompt;
|
|
1080
1167
|
this.pendingPromptMessageId = undefined;
|
|
1081
|
-
} else {
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
if (obligations.some((o) => o.from === "user")) return;
|
|
1085
|
-
} catch {
|
|
1086
|
-
// capture anyway on lookup failure
|
|
1087
|
-
}
|
|
1168
|
+
} else if (this.obligationCache.get().some((o) => o.from === "user")) {
|
|
1169
|
+
// The agent will answer the relay obligation itself — don't double-post (#196).
|
|
1170
|
+
return;
|
|
1088
1171
|
}
|
|
1089
1172
|
await this.publishSessionEvent({
|
|
1090
1173
|
from: this.agentId,
|
|
@@ -1363,36 +1446,24 @@ export class AgentRunner {
|
|
|
1363
1446
|
this.logRunnerDiagnostic(`[runner] HTTP liveness update failed: ${suffix}`);
|
|
1364
1447
|
}
|
|
1365
1448
|
|
|
1449
|
+
// Runner operational diagnostics (HTTP liveness, token renewal failures). Routed
|
|
1450
|
+
// through the leveled logger at warn — see logger.ts. Kept as a thin wrapper so
|
|
1451
|
+
// the existing call sites and their `[runner]` framing stay put.
|
|
1366
1452
|
private logRunnerDiagnostic(message: string): void {
|
|
1367
|
-
|
|
1368
|
-
console.error(message);
|
|
1369
|
-
return;
|
|
1370
|
-
}
|
|
1371
|
-
try {
|
|
1372
|
-
const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
|
|
1373
|
-
mkdirSync(logDir, { recursive: true });
|
|
1374
|
-
appendFileSync(join(logDir, `runner-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
|
|
1375
|
-
} catch {
|
|
1376
|
-
// Do not write runner diagnostics into an interactive provider TUI.
|
|
1377
|
-
}
|
|
1453
|
+
logger.warn("runner", message.replace(/^\[runner\]\s*/, ""));
|
|
1378
1454
|
}
|
|
1379
1455
|
|
|
1380
|
-
// Session-mirror diagnostics →
|
|
1381
|
-
//
|
|
1382
|
-
// to look when chat/terminal sync misbehaves.
|
|
1456
|
+
// Session-mirror diagnostics → the leveled logger (component "mirror"), written
|
|
1457
|
+
// to the dashboard-surfaced session-mirror-<agent>.log. Key transitions log at
|
|
1458
|
+
// info; the single place to look when chat/terminal sync misbehaves.
|
|
1383
1459
|
private sessionLog(message: string): void {
|
|
1384
|
-
|
|
1385
|
-
const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
|
|
1386
|
-
mkdirSync(logDir, { recursive: true });
|
|
1387
|
-
appendFileSync(join(logDir, `session-mirror-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
|
|
1388
|
-
} catch {
|
|
1389
|
-
// best-effort
|
|
1390
|
-
}
|
|
1460
|
+
logger.info("mirror", message);
|
|
1391
1461
|
}
|
|
1392
1462
|
|
|
1393
|
-
// Verbose, high-frequency lines (per-probe, per-emit) — only
|
|
1463
|
+
// Verbose, high-frequency lines (per-probe, per-emit) — surfaced only at log
|
|
1464
|
+
// level "debug" (AGENT_RELAY_LOG_LEVEL=debug, or flip live via /log-level).
|
|
1394
1465
|
private sessionDebug(message: string): void {
|
|
1395
|
-
|
|
1466
|
+
logger.debug("mirror", message);
|
|
1396
1467
|
}
|
|
1397
1468
|
|
|
1398
1469
|
private ensureScratch(): void {
|
|
@@ -1657,7 +1728,7 @@ export class AgentRunner {
|
|
|
1657
1728
|
})
|
|
1658
1729
|
.then(() => true)
|
|
1659
1730
|
.catch((error) => {
|
|
1660
|
-
|
|
1731
|
+
logger.error("task", `task ${claim.taskId} completion update failed: ${error}`);
|
|
1661
1732
|
return false;
|
|
1662
1733
|
});
|
|
1663
1734
|
if (!ok) continue;
|
|
@@ -1956,16 +2027,17 @@ function isHttpAuthError(error: unknown): boolean {
|
|
|
1956
2027
|
return status === 401 || status === 403;
|
|
1957
2028
|
}
|
|
1958
2029
|
|
|
2030
|
+
// True when `error` is a non-null object whose `status` property strictly equals `code`.
function isHttpStatusError(error: unknown, code: number): boolean {
  let status: unknown;
  if (error !== null && typeof error === "object") {
    status = (error as { status?: unknown }).status;
  }
  return status === code;
}
|
|
2034
|
+
|
|
1959
2035
|
// Stable key for an error: "status:<n>" when the error is an object carrying a
// numeric `status`, otherwise the error's string form.
function httpErrorKey(error: unknown): string {
  let status: unknown;
  if (error !== null && typeof error === "object") {
    status = (error as { status?: unknown }).status;
  }
  return typeof status === "number" ? `status:${status}` : String(error);
}
|
|
1964
2040
|
|
|
1965
|
-
function safeLogName(value: string): string {
|
|
1966
|
-
return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
|
|
1967
|
-
}
|
|
1968
|
-
|
|
1969
2041
|
function isContextState(value: unknown): value is ContextState {
|
|
1970
2042
|
if (!value || typeof value !== "object" || Array.isArray(value)) return false;
|
|
1971
2043
|
const state = value as Record<string, unknown>;
|