agent-relay-runner 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/plugins/claude/.claude-plugin/plugin.json +1 -1
- package/plugins/claude/hooks/permission-request.sh +2 -0
- package/plugins/claude/hooks/post-compact.sh +1 -0
- package/plugins/claude/hooks/pre-compact.sh +1 -0
- package/plugins/claude/hooks/relay-status.sh +28 -1
- package/plugins/claude/hooks/session-end.sh +1 -0
- package/plugins/claude/hooks/session-start.sh +1 -0
- package/plugins/claude/hooks/stop-failure.sh +1 -0
- package/plugins/claude/hooks/stop.sh +14 -2
- package/plugins/claude/hooks/subagent-start.sh +1 -0
- package/plugins/claude/hooks/subagent-stop.sh +1 -0
- package/plugins/claude/hooks/user-prompt-submit.sh +1 -0
- package/src/adapters/codex.ts +3 -2
- package/src/control-server.ts +43 -0
- package/src/logger.ts +97 -0
- package/src/runner.ts +53 -51
package/package.json
CHANGED
|
@@ -92,11 +92,38 @@ relay_post_session_end() {
|
|
|
92
92
|
-d "$body" >/dev/null 2>&1 || true
|
|
93
93
|
}
|
|
94
94
|
|
|
95
|
+
# --- Hook FATAL surfacing (#198) -------------------------------------------
|
|
96
|
+
# A hook that dies unexpectedly must never be silent. relay_install_hook_guard
|
|
97
|
+
# arms an ERR trap that reports the failure FATAL to the runner control port,
|
|
98
|
+
# which logs it to the dashboard-surfaced per-agent log. Best-effort and bounded
|
|
99
|
+
# (--max-time 2) so the report itself can never blow the hook's timeout budget.
|
|
100
|
+
relay_hook_fatal_report() {
|
|
101
|
+
local hook="${1:-unknown}" detail="${2:-}"
|
|
102
|
+
local port="${AGENT_RELAY_RUNNER_PORT:-}"
|
|
103
|
+
[ -z "$port" ] && return 0
|
|
104
|
+
local body="{\"hook\":\"$(relay_json_escape "$hook")\",\"error\":\"$(relay_json_escape "$detail")\"}"
|
|
105
|
+
curl -fsS --max-time 2 -X POST "http://127.0.0.1:${port}/hook-fatal" \
|
|
106
|
+
-H 'Content-Type: application/json' \
|
|
107
|
+
-d "$body" >/dev/null 2>&1 || true
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
relay_install_hook_guard() {
|
|
111
|
+
RELAY_HOOK_NAME="${1:-unknown}"
|
|
112
|
+
# Fires on any unhandled command failure under `set -e`/pipefail (not `set -u` unbound-variable aborts, which exit without running ERR) in the hook's
|
|
113
|
+
# main body, just before the shell exits. Reports, then lets the exit proceed.
|
|
114
|
+
# (ERR is not inherited into functions without `set -E`; this covers the top-level
|
|
115
|
+
# flow, which is where a silent death actually wedges a turn.)
|
|
116
|
+
trap 'relay_hook_err_rc=$?; relay_hook_fatal_report "${RELAY_HOOK_NAME:-unknown}" "exit ${relay_hook_err_rc}: ${BASH_COMMAND}"' ERR
|
|
117
|
+
}
|
|
118
|
+
|
|
95
119
|
relay_pending_reply_stop_decision() {
|
|
96
120
|
local port="${AGENT_RELAY_RUNNER_PORT:-}"
|
|
97
121
|
[ -z "$port" ] && return 0
|
|
98
122
|
local response
|
|
99
|
-
|
|
123
|
+
# --max-time guards the Claude Stop hook's 5s budget: a slow runner/server (e.g. an
|
|
124
|
+
# un-indexed obligation query) must never block past the timeout, or Claude SIGKILLs
|
|
125
|
+
# the hook before it clears the turn -> stuck "busy" (#199). On timeout: no block.
|
|
126
|
+
response="$(curl -fsS --max-time 2 "http://127.0.0.1:${port}/reply-obligations/claude-stop" 2>/dev/null || true)"
|
|
100
127
|
case "$response" in
|
|
101
128
|
*'"decision":"block"'*|*'"decision": "block"'*) ;;
|
|
102
129
|
*) return 0 ;;
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard session-end
|
|
4
5
|
|
|
5
6
|
payload="$(cat || true)"
|
|
6
7
|
reason="$(relay_json_string_field reason "$payload")"
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard session-start
|
|
4
5
|
|
|
5
6
|
payload="$(cat || true)"
|
|
6
7
|
source_kind="$(relay_json_string_field source "$payload")"
|
|
@@ -1,17 +1,29 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard stop
|
|
5
|
+
|
|
6
|
+
# Clearing the turn's busy state is the critical path (#199). Register it on EXIT
|
|
7
|
+
# so it runs even if a side-call below fails or times out under `set -e`. The one
|
|
8
|
+
# exception is the reply-obligation block path, which deliberately keeps the agent
|
|
9
|
+
# busy to answer — it opts out via the flag before exiting.
|
|
10
|
+
_relay_clear_idle_on_exit=1
|
|
11
|
+
trap '[ "${_relay_clear_idle_on_exit:-0}" = "1" ] && relay_post_status_clearing_subagents idle' EXIT
|
|
4
12
|
|
|
5
13
|
payload="$(cat || true)"
|
|
6
14
|
stop_hook_active="$(relay_json_bool_field stop_hook_active "$payload")"
|
|
7
15
|
if [ "$stop_hook_active" != "true" ]; then
|
|
8
16
|
last_assistant_msg="$(echo "$payload" | jq -c '.last_assistant_message // empty' 2>/dev/null || true)"
|
|
9
17
|
relay_post_session_turn "$(relay_json_string_field transcript_path "$payload")" "$last_assistant_msg"
|
|
10
|
-
|
|
18
|
+
# `|| true`: under `set -e`, a non-zero from the obligation check must never abort
|
|
19
|
+
# the hook before the idle-clear — clearing the turn is the critical path (#199).
|
|
20
|
+
stop_decision="$(relay_pending_reply_stop_decision || true)"
|
|
11
21
|
if [ -n "$stop_decision" ]; then
|
|
22
|
+
_relay_clear_idle_on_exit=0
|
|
12
23
|
printf '%s\n' "$stop_decision"
|
|
13
24
|
exit 0
|
|
14
25
|
fi
|
|
15
26
|
fi
|
|
16
27
|
|
|
17
|
-
|
|
28
|
+
# Normal turn end → the EXIT trap posts idle (always, even on an unexpected abort above).
|
|
29
|
+
exit 0
|
|
@@ -4,6 +4,7 @@ set -euo pipefail
|
|
|
4
4
|
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
|
|
5
5
|
# shellcheck source=/dev/null
|
|
6
6
|
source "${PLUGIN_ROOT}/hooks/relay-status.sh"
|
|
7
|
+
relay_install_hook_guard subagent-start
|
|
7
8
|
|
|
8
9
|
payload="$(cat || true)"
|
|
9
10
|
agent_id="$(relay_json_string_field agent_id "$payload")"
|
|
@@ -4,6 +4,7 @@ set -euo pipefail
|
|
|
4
4
|
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}"
|
|
5
5
|
# shellcheck source=/dev/null
|
|
6
6
|
source "${PLUGIN_ROOT}/hooks/relay-status.sh"
|
|
7
|
+
relay_install_hook_guard subagent-stop
|
|
7
8
|
|
|
8
9
|
payload="$(cat || true)"
|
|
9
10
|
agent_id="$(relay_json_string_field agent_id "$payload")"
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bash
|
|
2
2
|
set -euo pipefail
|
|
3
3
|
source "${CLAUDE_PLUGIN_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)}/hooks/relay-status.sh"
|
|
4
|
+
relay_install_hook_guard user-prompt-submit
|
|
4
5
|
payload="$(cat || true)"
|
|
5
6
|
relay_post_status busy
|
|
6
7
|
# Mirror a terminal/TUI-typed prompt into the dashboard chat and start reasoning
|
package/src/adapters/codex.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { basename, join, resolve } from "node:path";
|
|
|
4
4
|
import type { ContextState, Message } from "agent-relay-sdk";
|
|
5
5
|
import { profileAllowsRelayFeature, providerMessageText, RELAY_CONTEXT, type ManagedProcess, type ProviderAdapter, type ProviderConfig, type ProviderPermissionDecisionInput, type ProviderSessionEvent, type ProviderStatusUpdate, type RunnerSpawnConfig, type SpawnArgs, type TerminalAttachSpec } from "../adapter";
|
|
6
6
|
import { workspaceDepsNoteFromEnv } from "../relay-instructions";
|
|
7
|
+
import { logger } from "../logger";
|
|
7
8
|
|
|
8
9
|
/** Relay context prepended to a Codex agent's first turn: the standard relay
|
|
9
10
|
* blurb plus, when running in an isolated workspace, the deps caveat (#159). */
|
|
@@ -199,7 +200,7 @@ export class CodexAdapter implements ProviderAdapter {
|
|
|
199
200
|
input = codexRelayContextBlock() + "\n\n" + input;
|
|
200
201
|
process.meta = { ...(process.meta ?? {}), relayContextSent: true };
|
|
201
202
|
}
|
|
202
|
-
|
|
203
|
+
logger.info("codex", `starting Codex initial prompt in thread ${threadId}`);
|
|
203
204
|
const client = process.meta?.client as CodexAppClient;
|
|
204
205
|
await client.turnStart(threadId, input);
|
|
205
206
|
}
|
|
@@ -211,7 +212,7 @@ export class CodexAdapter implements ProviderAdapter {
|
|
|
211
212
|
text = codexRelayContextBlock() + "\n\n" + text;
|
|
212
213
|
process.meta = { ...(process.meta ?? {}), relayContextSent: true };
|
|
213
214
|
}
|
|
214
|
-
|
|
215
|
+
logger.info("codex", codexDeliveryNotice(messages, threadId));
|
|
215
216
|
const client = process.meta?.client as CodexAppClient;
|
|
216
217
|
await client.turnStart(threadId, text);
|
|
217
218
|
}
|
package/src/control-server.ts
CHANGED
|
@@ -1,6 +1,16 @@
|
|
|
1
1
|
import type { Server, ServerWebSocket } from "bun";
|
|
2
2
|
import type { Message, ReplyObligation } from "agent-relay-sdk";
|
|
3
3
|
import type { ProviderPermissionDecisionInput, ProviderStatusEvent, SemanticStatus, TerminalAttachSpec } from "./adapter";
|
|
4
|
+
import { logger, parseLogLevel, LOG_LEVELS } from "./logger";
|
|
5
|
+
|
|
6
|
+
// A hook that failed in a way it could not handle itself reports here so the
|
|
7
|
+
// failure is never silent (#198 item 5). Phase 1 logs it FATAL to the per-agent
|
|
8
|
+
// log; Phase 2 (#196) will additionally route it through the runner outbox to the
|
|
9
|
+
// server.
|
|
10
|
+
export interface HookFatalReport {
|
|
11
|
+
hook: string;
|
|
12
|
+
error: string;
|
|
13
|
+
}
|
|
4
14
|
|
|
5
15
|
interface MonitorSocketData {
|
|
6
16
|
kind: "monitor";
|
|
@@ -33,6 +43,10 @@ interface ControlServerOptions {
|
|
|
33
43
|
// transcript. transcriptPath is optional — the runner falls back to the last
|
|
34
44
|
// path it saw during the session.
|
|
35
45
|
onSessionEnd?(input: { reason?: string; transcriptPath?: string }): Promise<void>;
|
|
46
|
+
// Phase 1 observability (#198): a hook reporting an unhandled failure. The
|
|
47
|
+
// control server already logs it FATAL; this is the seam for Phase 2 to also
|
|
48
|
+
// surface it to the server via the runner outbox.
|
|
49
|
+
onHookFatal?(report: HookFatalReport): void;
|
|
36
50
|
}
|
|
37
51
|
|
|
38
52
|
export function startControlServer(options: ControlServerOptions): ControlServer {
|
|
@@ -81,6 +95,15 @@ export function startControlServer(options: ControlServerOptions): ControlServer
|
|
|
81
95
|
if (url.pathname === "/session-end" && req.method === "POST") {
|
|
82
96
|
return handleSessionEnd(req, options);
|
|
83
97
|
}
|
|
98
|
+
if (url.pathname === "/log-level" && req.method === "GET") {
|
|
99
|
+
return Response.json({ level: logger.getLevel(), levels: LOG_LEVELS });
|
|
100
|
+
}
|
|
101
|
+
if (url.pathname === "/log-level" && req.method === "POST") {
|
|
102
|
+
return handleLogLevel(req);
|
|
103
|
+
}
|
|
104
|
+
if (url.pathname === "/hook-fatal" && req.method === "POST") {
|
|
105
|
+
return handleHookFatal(req, options);
|
|
106
|
+
}
|
|
84
107
|
if (url.pathname === "/monitor") {
|
|
85
108
|
const upgraded = srv.upgrade(req, { data: { kind: "monitor" } });
|
|
86
109
|
return upgraded ? undefined : new Response("WebSocket upgrade failed", { status: 400 });
|
|
@@ -361,6 +384,26 @@ async function handleSessionEnd(req: Request, options: ControlServerOptions): Pr
|
|
|
361
384
|
return Response.json({ ok: true });
|
|
362
385
|
}
|
|
363
386
|
|
|
387
|
+
async function handleLogLevel(req: Request): Promise<Response> {
|
|
388
|
+
const body = await req.json().catch(() => null);
|
|
389
|
+
const level = parseLogLevel(isRecord(body) && typeof body.level === "string" ? body.level : undefined);
|
|
390
|
+
if (!level) return Response.json({ error: `level must be one of: ${LOG_LEVELS.join(", ")}` }, { status: 400 });
|
|
391
|
+
const previous = logger.getLevel();
|
|
392
|
+
logger.setLevel(level);
|
|
393
|
+
logger.info("logger", `log level set to ${level} (was ${previous}) via control port`);
|
|
394
|
+
return Response.json({ ok: true, level, previous });
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
async function handleHookFatal(req: Request, options: ControlServerOptions): Promise<Response> {
|
|
398
|
+
const body = await req.json().catch(() => null);
|
|
399
|
+
const hook = isRecord(body) && typeof body.hook === "string" && body.hook.trim() ? body.hook.trim() : "unknown";
|
|
400
|
+
const error = isRecord(body) && typeof body.error === "string" ? body.error : "(no detail)";
|
|
401
|
+
// Never silent: a hook that couldn't handle its own failure lands here as FATAL.
|
|
402
|
+
logger.fatal(`hook:${hook}`, error);
|
|
403
|
+
try { options.onHookFatal?.({ hook, error }); } catch { /* reporting must never throw back at the hook */ }
|
|
404
|
+
return Response.json({ ok: true });
|
|
405
|
+
}
|
|
406
|
+
|
|
364
407
|
async function handleStatus(req: Request, options: ControlServerOptions): Promise<Response> {
|
|
365
408
|
const body = await req.json().catch(() => null) as Partial<ProviderStatusEvent> | null;
|
|
366
409
|
const status = body?.status;
|
package/src/logger.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { appendFileSync, mkdirSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
|
|
4
|
+
// Phase 1 observability (#198): one leveled, runtime-togglable logger for the
|
|
5
|
+
// Runner and the provider adapters below it. Replaces the ad-hoc scatter of
|
|
6
|
+
// `console.error`, `logRunnerDiagnostic` (-> runner-<agent>.log) and
|
|
7
|
+
// `sessionLog`/`sessionDebug` (-> session-mirror-<agent>.log) with a single
|
|
8
|
+
// switch and a single greppable, ANSI-free sink.
|
|
9
|
+
//
|
|
10
|
+
// Sink: the per-agent `session-mirror-<agent>.log` — the file the orchestrator
|
|
11
|
+
// already surfaces to the dashboard log-viewer (captureSessionMirror). One place
|
|
12
|
+
// to look when anything in the Runner misbehaves.
|
|
13
|
+
//
|
|
14
|
+
// Level is read once from AGENT_RELAY_LOG_LEVEL (default "info") and can be
|
|
15
|
+
// flipped at runtime via the control port (no restart) — so a phase refactor can
|
|
16
|
+
// be watched at debug without bouncing the agent.
|
|
17
|
+
|
|
18
|
+
export type LogLevel = "debug" | "info" | "warn" | "error" | "fatal";
|
|
19
|
+
|
|
20
|
+
const ORDER: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40, fatal: 50 };
|
|
21
|
+
export const LOG_LEVELS = Object.keys(ORDER) as LogLevel[];
|
|
22
|
+
|
|
23
|
+
export function parseLogLevel(value: string | undefined | null): LogLevel | undefined {
|
|
24
|
+
if (!value) return undefined;
|
|
25
|
+
const v = value.trim().toLowerCase();
|
|
26
|
+
return (LOG_LEVELS as string[]).includes(v) ? (v as LogLevel) : undefined;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
// Must match the orchestrator's safeMirrorLogName (this replaces the runner's former
|
|
30
|
+
// local safeLogName) so both sides resolve the identical filename for a given agent id.
|
|
31
|
+
function safeLogName(value: string): string {
|
|
32
|
+
return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
export interface LoggerConfig {
|
|
36
|
+
agentId?: string;
|
|
37
|
+
level?: LogLevel;
|
|
38
|
+
headless?: boolean;
|
|
39
|
+
logDir?: string;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export class Logger {
|
|
43
|
+
private level: LogLevel;
|
|
44
|
+
private agentId: string;
|
|
45
|
+
private headless: boolean;
|
|
46
|
+
private logDir: string;
|
|
47
|
+
|
|
48
|
+
constructor(config: LoggerConfig = {}) {
|
|
49
|
+
this.level = config.level ?? parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ?? "info";
|
|
50
|
+
this.agentId = config.agentId ?? "runner";
|
|
51
|
+
this.headless = config.headless ?? false;
|
|
52
|
+
this.logDir = config.logDir ?? join(process.env.HOME || ".", ".agent-relay", "logs");
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// Bind the logger to a concrete agent once the runner knows its id. Preserves a
|
|
56
|
+
// level already set via env/runtime unless an explicit level is passed.
|
|
57
|
+
configure(config: LoggerConfig): void {
|
|
58
|
+
if (config.agentId !== undefined) this.agentId = config.agentId;
|
|
59
|
+
if (config.headless !== undefined) this.headless = config.headless;
|
|
60
|
+
if (config.logDir !== undefined) this.logDir = config.logDir;
|
|
61
|
+
if (config.level !== undefined) this.level = config.level;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
setLevel(level: LogLevel): void { this.level = level; }
|
|
65
|
+
getLevel(): LogLevel { return this.level; }
|
|
66
|
+
isEnabled(level: LogLevel): boolean { return ORDER[level] >= ORDER[this.level]; }
|
|
67
|
+
|
|
68
|
+
debug(component: string, message: string): void { this.log("debug", component, message); }
|
|
69
|
+
info(component: string, message: string): void { this.log("info", component, message); }
|
|
70
|
+
warn(component: string, message: string): void { this.log("warn", component, message); }
|
|
71
|
+
error(component: string, message: string): void { this.log("error", component, message); }
|
|
72
|
+
fatal(component: string, message: string): void { this.log("fatal", component, message); }
|
|
73
|
+
|
|
74
|
+
log(level: LogLevel, component: string, message: string): void {
|
|
75
|
+
if (!this.isEnabled(level)) return;
|
|
76
|
+
const line = `[${new Date().toISOString()}] ${level.toUpperCase().padEnd(5)} [${component}] ${oneLine(message)}\n`;
|
|
77
|
+
try {
|
|
78
|
+
mkdirSync(this.logDir, { recursive: true });
|
|
79
|
+
appendFileSync(join(this.logDir, `session-mirror-${safeLogName(this.agentId)}.log`), line);
|
|
80
|
+
} catch {
|
|
81
|
+
// Best-effort. If the per-agent file can't be written, surface error/fatal to
|
|
82
|
+
// stderr so it is not lost entirely (headless: lands in the orchestrator log).
|
|
83
|
+
if (ORDER[level] >= ORDER.error) { try { console.error(line.trimEnd()); } catch { /* give up */ } }
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// Newlines would split one record across several log lines and break greppability;
|
|
89
|
+
// collapse them so a multi-line message stays one line.
|
|
90
|
+
function oneLine(message: string): string {
|
|
91
|
+
return message.replace(/\r?\n/g, " ⏎ ");
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
// Process-global logger. A runner process serves exactly one agent, so a singleton
|
|
95
|
+
// is the right scope; the runner calls configure() once it knows its id, and
|
|
96
|
+
// adapters import this instance directly (no constructor threading).
|
|
97
|
+
export const logger = new Logger();
|
package/src/runner.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { hostname } from "node:os";
|
|
2
|
-
import {
|
|
2
|
+
import { closeSync, mkdirSync, openSync, readSync, statSync, writeFileSync } from "node:fs";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { dirname, join } from "node:path";
|
|
5
5
|
import type { AgentProfile, ContextState, Message, MessageSessionMeta, ProviderCapabilities, TaskStatusInput, WorkspaceMetadata } from "agent-relay-sdk";
|
|
@@ -13,6 +13,7 @@ import { extractLastAssistantTurn, extractFinalAssistantMessage, extractHookAssi
|
|
|
13
13
|
import { agentProfileProjectionReport } from "./profile-projection";
|
|
14
14
|
import { profileUsesHostProviderGlobals } from "./profile-home";
|
|
15
15
|
import { runtimeMetadata } from "./version";
|
|
16
|
+
import { logger, parseLogLevel } from "./logger";
|
|
16
17
|
import { ensureSessionScratch, reapSessionScratch, sweepStaleSessions, type SessionScratchLayout } from "./session-scratch";
|
|
17
18
|
|
|
18
19
|
interface RunnerOptions {
|
|
@@ -76,12 +77,18 @@ const LOG_TAIL_BYTES = 128 * 1024;
|
|
|
76
77
|
const PROMPT_ECHO_DEDUP_MS = 30_000;
|
|
77
78
|
// Busy reconciler: a conservative LAST-RESORT backstop for a turn that ended
|
|
78
79
|
// without the provider's Stop hook clearing busy (e.g. ESC straight into the web
|
|
79
|
-
// terminal). It must never fire during a live turn, so it
|
|
80
|
-
// after it has actually observed the provider busy, and (b) requires a long,
|
|
80
|
+
// terminal). It must never fire during a live turn, so it requires a long,
|
|
81
81
|
// unbroken idle streak — an active turn shows its working spinner well within
|
|
82
82
|
// this window, which resets the streak. ~32s of uninterrupted idle = really done.
|
|
83
83
|
const BUSY_RECONCILE_POLL_MS = 4_000;
|
|
84
84
|
const BUSY_RECONCILE_IDLE_CONFIRM = 8;
|
|
85
|
+
// When the reconciler never observed the provider busy this turn (a turn faster
|
|
86
|
+
// than the 4s poll — common for short voice/autosend replies), it can't trust a
|
|
87
|
+
// quick idle the way it does after seeing the spinner. But refusing forever wedged
|
|
88
|
+
// fast turns in "busy" when the Stop hook's idle was lost (#199). So we still
|
|
89
|
+
// force-clear, just after a much longer unbroken-idle window — an active turn would
|
|
90
|
+
// have flashed its spinner into at least one of these probes and reset the streak.
|
|
91
|
+
const BUSY_RECONCILE_IDLE_CONFIRM_NO_BUSY = 15;
|
|
85
92
|
// After a dashboard interrupt, give the provider a moment to drop out of its turn,
|
|
86
93
|
// then reconcile immediately so the user sees "stopped" without waiting for the backstop.
|
|
87
94
|
const INTERRUPT_RECONCILE_DELAY_MS = 1_500;
|
|
@@ -171,6 +178,14 @@ export class AgentRunner {
|
|
|
171
178
|
|
|
172
179
|
constructor(private readonly options: RunnerOptions) {
|
|
173
180
|
this.agentId = options.agentId ?? options.runnerId;
|
|
181
|
+
// Bind the process-global logger to this agent. AGENT_RELAY_SESSION_DEBUG=1 is
|
|
182
|
+
// kept as a back-compat alias for the verbose probe/emit lines, now expressed
|
|
183
|
+
// as log level "debug" (AGENT_RELAY_LOG_LEVEL still wins when both are set).
|
|
184
|
+
logger.configure({
|
|
185
|
+
agentId: this.agentId,
|
|
186
|
+
headless: options.headless,
|
|
187
|
+
...(this.sessionDebugVerbose && !parseLogLevel(process.env.AGENT_RELAY_LOG_LEVEL) ? { level: "debug" as const } : {}),
|
|
188
|
+
});
|
|
174
189
|
this.currentToken = options.token;
|
|
175
190
|
this.currentTokenJti = options.tokenJti;
|
|
176
191
|
this.currentTokenProfileId = options.tokenProfileId;
|
|
@@ -381,7 +396,7 @@ export class AgentRunner {
|
|
|
381
396
|
startedAt: this.options.startedAt,
|
|
382
397
|
}, null, 2) + "\n", { mode: 0o600 });
|
|
383
398
|
} catch (error) {
|
|
384
|
-
|
|
399
|
+
logger.error("runner", `failed to write runner info file: ${error}`);
|
|
385
400
|
}
|
|
386
401
|
}
|
|
387
402
|
|
|
@@ -397,7 +412,7 @@ export class AgentRunner {
|
|
|
397
412
|
const messages = await this.http.pollMessages({ for: this.agentId, unread: true, limit: 100 });
|
|
398
413
|
for (const message of messages) this.enqueueMessage(message);
|
|
399
414
|
} catch (error) {
|
|
400
|
-
|
|
415
|
+
logger.error("runner", `inbox bootstrap failed: ${error}`);
|
|
401
416
|
}
|
|
402
417
|
}
|
|
403
418
|
|
|
@@ -407,7 +422,7 @@ export class AgentRunner {
|
|
|
407
422
|
try {
|
|
408
423
|
await this.options.adapter.deliverInitialPrompt(this.process, prompt);
|
|
409
424
|
} catch (error) {
|
|
410
|
-
|
|
425
|
+
logger.error("runner", `initial prompt delivery failed: ${error}`);
|
|
411
426
|
}
|
|
412
427
|
}
|
|
413
428
|
|
|
@@ -444,7 +459,7 @@ export class AgentRunner {
|
|
|
444
459
|
status: "in_progress",
|
|
445
460
|
agentId: this.agentId,
|
|
446
461
|
metadata: { messageId: message.id, completedBy: "runner" },
|
|
447
|
-
}).catch((error) =>
|
|
462
|
+
}).catch((error) => logger.error("task", `task ${taskId} in_progress update failed: ${error}`));
|
|
448
463
|
// Runner owns claim + status here; drop the server's self-claim instruction
|
|
449
464
|
// so the agent doesn't improvise a stray claim send (see stripRunnerClaimedGuidance).
|
|
450
465
|
toDeliver = { ...message, body: stripRunnerClaimedGuidance(message.body) };
|
|
@@ -462,7 +477,7 @@ export class AgentRunner {
|
|
|
462
477
|
try {
|
|
463
478
|
const prepared = await messagesWithCachedAttachments(deliverable, this.http, {
|
|
464
479
|
agentId: this.agentId,
|
|
465
|
-
onError: (message) =>
|
|
480
|
+
onError: (message) => logger.error("runner", message),
|
|
466
481
|
});
|
|
467
482
|
await this.options.adapter.deliver(this.process, prepared);
|
|
468
483
|
for (const message of deliverable) {
|
|
@@ -471,7 +486,7 @@ export class AgentRunner {
|
|
|
471
486
|
}
|
|
472
487
|
} catch (error) {
|
|
473
488
|
failed = true;
|
|
474
|
-
if (shouldLogDeliveryFailure(error))
|
|
489
|
+
if (shouldLogDeliveryFailure(error)) logger.warn("delivery", `message delivery failed: ${error}`);
|
|
475
490
|
for (const message of deliverable) {
|
|
476
491
|
this.clearActiveClaim(message);
|
|
477
492
|
this.pendingMessages.set(message.id, message);
|
|
@@ -539,7 +554,7 @@ export class AgentRunner {
|
|
|
539
554
|
await this.http.deleteAgent(this.agentId).catch(() => {});
|
|
540
555
|
if (this.options.exitProcessOnShutdown !== false) {
|
|
541
556
|
setTimeout(() => void this.stop().catch((error) => {
|
|
542
|
-
|
|
557
|
+
logger.error("lifecycle", `stop after command failed: ${error}`);
|
|
543
558
|
}).finally(() => process.exit(0)), 10);
|
|
544
559
|
}
|
|
545
560
|
} else if (!this.stopped) {
|
|
@@ -674,7 +689,7 @@ export class AgentRunner {
|
|
|
674
689
|
|
|
675
690
|
if (this.shouldStopUnexpectedProviderExit(diagnostics)) {
|
|
676
691
|
const hasResumeId = typeof diagnostics.claudeResumeId === "string" && diagnostics.claudeResumeId.length > 0;
|
|
677
|
-
|
|
692
|
+
logger.warn("lifecycle", `${this.options.provider} exited; leaving agent offline for manual recovery`);
|
|
678
693
|
this.publishRunnerTimelineEvent({
|
|
679
694
|
status: "provider.restart_decision",
|
|
680
695
|
id: `provider-restart-decision-${this.providerSessionId}-${now}`,
|
|
@@ -702,7 +717,7 @@ export class AgentRunner {
|
|
|
702
717
|
}
|
|
703
718
|
|
|
704
719
|
if (runtimeMs < RAPID_EXIT_MS && recent.length > MAX_RAPID_UNEXPECTED_EXITS) {
|
|
705
|
-
|
|
720
|
+
logger.error("lifecycle", `provider session exited ${recent.length} times within ${Math.round(UNEXPECTED_EXIT_WINDOW_MS / 1000)}s; giving up`);
|
|
706
721
|
this.publishRunnerTimelineEvent({
|
|
707
722
|
status: "provider.restart_decision",
|
|
708
723
|
id: `provider-restart-decision-${this.providerSessionId}-${now}`,
|
|
@@ -726,7 +741,7 @@ export class AgentRunner {
|
|
|
726
741
|
}
|
|
727
742
|
|
|
728
743
|
const delayMs = Math.min(10_000, Math.max(500, 500 * recent.length));
|
|
729
|
-
|
|
744
|
+
logger.warn("lifecycle", `provider session exited unexpectedly after ${Math.round(runtimeMs / 1000)}s; restarting in ${delayMs}ms`);
|
|
730
745
|
this.publishRunnerTimelineEvent({
|
|
731
746
|
status: "provider.restart_decision",
|
|
732
747
|
id: `provider-restart-decision-${this.providerSessionId}-${now}`,
|
|
@@ -751,7 +766,7 @@ export class AgentRunner {
|
|
|
751
766
|
this.publishStatus();
|
|
752
767
|
this.scheduleDrain();
|
|
753
768
|
} catch (error) {
|
|
754
|
-
|
|
769
|
+
logger.error("lifecycle", `provider restart after unexpected exit failed: ${error}`);
|
|
755
770
|
this.setProviderStatus("error");
|
|
756
771
|
this.options.onProviderExit?.(1);
|
|
757
772
|
}
|
|
@@ -826,10 +841,10 @@ export class AgentRunner {
|
|
|
826
841
|
private handleBusError(code: string, message: string): void {
|
|
827
842
|
const action = runnerBusErrorAction(code, this.stopped);
|
|
828
843
|
if (action === "ignore") return;
|
|
829
|
-
|
|
844
|
+
logger.error("bus", `bus error ${code}: ${message}`);
|
|
830
845
|
if (action === "stop") {
|
|
831
846
|
void this.stop().catch((error) => {
|
|
832
|
-
|
|
847
|
+
logger.error("bus", `stop after bus error failed: ${error}`);
|
|
833
848
|
}).finally(() => process.exit(0));
|
|
834
849
|
}
|
|
835
850
|
}
|
|
@@ -1146,18 +1161,21 @@ export class AgentRunner {
|
|
|
1146
1161
|
let activity: "busy" | "idle" | "unknown";
|
|
1147
1162
|
try { activity = await this.options.adapter.probeActivity(this.process); } catch { return; }
|
|
1148
1163
|
if (activity === "busy") this.busyReconcileSawBusy = true;
|
|
1149
|
-
// Reset the streak on anything that isn't a confident idle
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
if (activity !== "idle") this.busyReconcileIdleStreak = 0;
|
|
1164
|
+
// Reset the streak on anything that isn't a confident idle.
|
|
1165
|
+
if (activity !== "idle") {
|
|
1166
|
+
this.busyReconcileIdleStreak = 0;
|
|
1153
1167
|
this.sessionDebug(`reconcile probe=${activity} sawBusy=${this.busyReconcileSawBusy} streak=${this.busyReconcileIdleStreak}`);
|
|
1154
1168
|
return;
|
|
1155
1169
|
}
|
|
1156
1170
|
this.busyReconcileIdleStreak += 1;
|
|
1157
|
-
|
|
1158
|
-
|
|
1171
|
+
// Confirm faster once we've seen the spinner this turn; otherwise demand a much
|
|
1172
|
+
// longer all-idle window before trusting it (rescues fast turns without
|
|
1173
|
+
// false-clearing a live turn that simply hasn't flashed busy into a probe yet).
|
|
1174
|
+
const confirm = this.busyReconcileSawBusy ? BUSY_RECONCILE_IDLE_CONFIRM : BUSY_RECONCILE_IDLE_CONFIRM_NO_BUSY;
|
|
1175
|
+
this.sessionDebug(`reconcile probe=idle sawBusy=${this.busyReconcileSawBusy} streak=${this.busyReconcileIdleStreak}/${confirm}`);
|
|
1176
|
+
if (this.busyReconcileIdleStreak < confirm) return;
|
|
1159
1177
|
this.disarmBusyReconciler();
|
|
1160
|
-
this.forceClearProviderTurn("backstop reconciler");
|
|
1178
|
+
this.forceClearProviderTurn(this.busyReconcileSawBusy ? "backstop reconciler" : "backstop reconciler (no-busy-observed)");
|
|
1161
1179
|
}
|
|
1162
1180
|
|
|
1163
1181
|
// Force-clear a stuck provider-turn claim directly. Unlike the idle status path
|
|
@@ -1354,36 +1372,24 @@ export class AgentRunner {
|
|
|
1354
1372
|
this.logRunnerDiagnostic(`[runner] HTTP liveness update failed: ${suffix}`);
|
|
1355
1373
|
}
|
|
1356
1374
|
|
|
1375
|
+
// Runner operational diagnostics (HTTP liveness, token renewal failures). Routed
|
|
1376
|
+
// through the leveled logger at warn — see logger.ts. Kept as a thin wrapper so
|
|
1377
|
+
// the existing call sites and their `[runner]` framing stay put.
|
|
1357
1378
|
private logRunnerDiagnostic(message: string): void {
|
|
1358
|
-
|
|
1359
|
-
console.error(message);
|
|
1360
|
-
return;
|
|
1361
|
-
}
|
|
1362
|
-
try {
|
|
1363
|
-
const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
|
|
1364
|
-
mkdirSync(logDir, { recursive: true });
|
|
1365
|
-
appendFileSync(join(logDir, `runner-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
|
|
1366
|
-
} catch {
|
|
1367
|
-
// Do not write runner diagnostics into an interactive provider TUI.
|
|
1368
|
-
}
|
|
1379
|
+
logger.warn("runner", message.replace(/^\[runner\]\s*/, ""));
|
|
1369
1380
|
}
|
|
1370
1381
|
|
|
1371
|
-
// Session-mirror diagnostics →
|
|
1372
|
-
//
|
|
1373
|
-
// to look when chat/terminal sync misbehaves.
|
|
1382
|
+
// Session-mirror diagnostics → the leveled logger (component "mirror"), written
|
|
1383
|
+
// to the dashboard-surfaced session-mirror-<agent>.log. Key transitions log at
|
|
1384
|
+
// info; the single place to look when chat/terminal sync misbehaves.
|
|
1374
1385
|
private sessionLog(message: string): void {
|
|
1375
|
-
|
|
1376
|
-
const logDir = join(process.env.HOME || ".", ".agent-relay", "logs");
|
|
1377
|
-
mkdirSync(logDir, { recursive: true });
|
|
1378
|
-
appendFileSync(join(logDir, `session-mirror-${safeLogName(this.agentId)}.log`), `[${new Date().toISOString()}] ${message}\n`);
|
|
1379
|
-
} catch {
|
|
1380
|
-
// best-effort
|
|
1381
|
-
}
|
|
1386
|
+
logger.info("mirror", message);
|
|
1382
1387
|
}
|
|
1383
1388
|
|
|
1384
|
-
// Verbose, high-frequency lines (per-probe, per-emit) — only
|
|
1389
|
+
// Verbose, high-frequency lines (per-probe, per-emit) — surfaced only at log
|
|
1390
|
+
// level "debug" (AGENT_RELAY_LOG_LEVEL=debug, or flip live via /log-level).
|
|
1385
1391
|
private sessionDebug(message: string): void {
|
|
1386
|
-
|
|
1392
|
+
logger.debug("mirror", message);
|
|
1387
1393
|
}
|
|
1388
1394
|
|
|
1389
1395
|
private ensureScratch(): void {
|
|
@@ -1648,7 +1654,7 @@ export class AgentRunner {
|
|
|
1648
1654
|
})
|
|
1649
1655
|
.then(() => true)
|
|
1650
1656
|
.catch((error) => {
|
|
1651
|
-
|
|
1657
|
+
logger.error("task", `task ${claim.taskId} completion update failed: ${error}`);
|
|
1652
1658
|
return false;
|
|
1653
1659
|
});
|
|
1654
1660
|
if (!ok) continue;
|
|
@@ -1953,10 +1959,6 @@ function httpErrorKey(error: unknown): string {
|
|
|
1953
1959
|
return String(error);
|
|
1954
1960
|
}
|
|
1955
1961
|
|
|
1956
|
-
function safeLogName(value: string): string {
|
|
1957
|
-
return value.replace(/[^a-zA-Z0-9_.-]+/g, "_").slice(0, 180);
|
|
1958
|
-
}
|
|
1959
|
-
|
|
1960
1962
|
function isContextState(value: unknown): value is ContextState {
|
|
1961
1963
|
if (!value || typeof value !== "object" || Array.isArray(value)) return false;
|
|
1962
1964
|
const state = value as Record<string, unknown>;
|