agent-relay-orchestrator 0.62.2 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-orchestrator",
3
- "version": "0.62.2",
3
+ "version": "0.63.0",
4
4
  "description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
5
5
  "type": "module",
6
6
  "bin": {
@@ -16,7 +16,7 @@
16
16
  "test": "bun test"
17
17
  },
18
18
  "dependencies": {
19
- "agent-relay-sdk": "0.2.39"
19
+ "agent-relay-sdk": "0.2.41"
20
20
  },
21
21
  "devDependencies": {
22
22
  "@types/bun": "latest",
@@ -0,0 +1,105 @@
1
+ import { existsSync } from "node:fs";
2
+ import { homedir } from "node:os";
3
+ import { isAbsolute, join, relative, resolve } from "node:path";
4
+ import { artifactProxyBaseUrl } from "../artifact-proxy";
5
+ import type { OrchestratorConfig } from "../config";
6
+ import { sanitizeFsName } from "agent-relay-sdk/fs-name";
7
+ import type { SpawnOptions } from "./types";
8
+
9
/**
 * Report whether `path` resolves to `baseDir` itself or to a location beneath it.
 *
 * Both arguments are resolved to absolute paths first, so relative inputs and
 * embedded `..` segments are normalised before the comparison.
 *
 * @param path - Candidate path to test.
 * @param baseDir - Directory that must contain `path`.
 * @returns `true` when the resolved path is the base directory or a descendant of it.
 */
export function isWithinBaseDir(path: string, baseDir: string): boolean {
  const rel = relative(resolve(baseDir), resolve(path));
  // Empty relative path: the target is the base directory itself.
  if (rel === "") return true;
  // An absolute result means the two paths share no common root.
  if (isAbsolute(rel)) return false;
  // Only a leading `..` *segment* escapes the base. A name such as `..cache`
  // merely begins with two dots and is still inside it, so compare the whole
  // first segment rather than using a bare prefix test.
  return rel !== ".." && !rel.startsWith("../") && !rel.startsWith("..\\");
}
15
+
16
/**
 * Build the session name for a spawned agent:
 * `<tmuxPrefix>-<provider>-<label>` plus, when `uniqueId` is given, a dash and
 * the last eight characters of the sanitised id.
 *
 * @param config - Orchestrator configuration; only `tmuxPrefix` is read.
 * @param provider - Provider name, inserted as-is.
 * @param label - Human label; sanitised and lower-cased before use.
 * @param uniqueId - Optional identifier used to disambiguate equal labels.
 */
export function sessionName(config: OrchestratorConfig, provider: string, label: string, uniqueId?: string): string {
  const base = `${config.tmuxPrefix}-${provider}-${sanitizeFsName(label, { replacement: "-", lowercase: true })}`;
  if (!uniqueId) return base;
  const tail = sanitizeFsName(uniqueId, { replacement: "-", lowercase: true }).slice(-8);
  return `${base}-${tail}`;
}
21
+
22
/**
 * Produce the fallback label for a spawn that was not given one.
 *
 * @param now - Timestamp embedded in the label; defaults to the current time in ms.
 * @returns `session-<now>`.
 */
export function defaultSpawnLabel(now = Date.now()): string {
  return "session-" + String(now);
}
25
+
26
/**
 * Assemble the argv used to launch a headless runner for `opts.provider`.
 *
 * Launcher resolution, in order: the in-repo runner entry point, the installed
 * `agent-relay-runner` entry point (both run through bun), and finally a
 * `<provider>-relay` executable looked up on PATH.
 *
 * @param opts - Spawn request; optional fields become flags only when set.
 * @param config - Orchestrator configuration; supplies the relay URL.
 * @returns The full argument vector, executable first.
 */
export function buildRunnerCommand(opts: SpawnOptions, config: OrchestratorConfig): string[] {
  // Pick the bun binary: explicit override, then Homebrew on macOS, then PATH.
  let bunBin = process.env.AGENT_RELAY_BUN_BIN;
  if (!bunBin) {
    const homebrewBun = "/opt/homebrew/bin/bun";
    bunBin = process.platform === "darwin" && existsSync(homebrewBun) ? homebrewBun : "bun";
  }

  // First launcher script that exists on disk wins.
  const script = ["../../../runner/src/index.ts", "../../../agent-relay-runner/src/index.ts"]
    .map((candidate) => resolve(import.meta.dir, candidate))
    .find((candidate) => existsSync(candidate));

  const argv: string[] = script
    ? [bunBin, "run", script, opts.provider]
    : [`${opts.provider}-relay`, opts.provider];

  argv.push(
    "--headless",
    "--cwd", opts.cwd,
    "--relay-url", config.relayUrl,
    "--approval", opts.approvalMode || "guarded",
  );

  // Single-valued flags, emitted in this order and only when the value is truthy.
  const singleValued: Array<[string, string | undefined]> = [
    ["--rig", opts.rig],
    ["--model", opts.model],
    ["--effort", opts.effort],
    ["--profile", opts.profile],
    ["--label", opts.label],
    ["--agent-id", opts.agentId],
    ["--prompt", opts.prompt],
    ["--system-prompt-append", opts.systemPromptAppend],
  ];
  for (const [flag, value] of singleValued) {
    if (value) argv.push(flag, value);
  }

  // List-valued flags are gated on the list being non-empty, not on the joined text.
  if (opts.tags?.length) argv.push("--tags", opts.tags.join(","));
  if (opts.capabilities?.length) argv.push("--caps", opts.capabilities.join(","));

  // Everything after `--` is forwarded untouched to the provider.
  if (opts.providerArgs?.length) argv.push("--", ...opts.providerArgs);
  return argv;
}
56
+
57
/**
 * Build the environment for a spawned runner process.
 *
 * Precedence, lowest to highest: the orchestrator's own environment, the
 * configured token, `config.env`, the agent profile's `env`, `opts.env`, and
 * finally the `AGENT_RELAY_*` / `PATH` values set here, which always win.
 *
 * @param opts - Spawn request with a resolved label and agent id.
 * @param config - Orchestrator configuration.
 * @param logFile - Log file path to advertise to the runner, if any.
 * @param tmuxSession - tmux session name to advertise to the runner, if any.
 * @returns A flat string-to-string environment map.
 */
export function buildEnv(opts: SpawnOptions & { label: string; agentId: string }, config: OrchestratorConfig, logFile?: string, tmuxSession?: string): Record<string, string> {
  // Single-key object when the value is truthy, otherwise nothing to spread.
  const optional = (key: string, value: string | undefined): Record<string, string> =>
    (value ? { [key]: value } : {});

  // Prepend per-user bin directories, then de-duplicate keeping first occurrences.
  const home = homedir();
  const inheritedPath = (process.env.PATH || "").split(":").filter(Boolean);
  const searchPath = [...new Set([
    join(home, ".local", "bin"),
    join(home, ".bun", "bin"),
    join(home, ".npm-global", "bin"),
    ...inheritedPath,
  ])].join(":");

  // #330 — tag by TRUE origin. An MCP spawn (an agent spawning a helper) is `agent-spawned`;
  // dashboard/CLI/automation spawns (no `requestedVia: "mcp"`) keep the `dashboard-spawned`
  // tag the smoke test and UI filter on.
  const originTag = opts.requestedVia === "mcp" ? "agent-spawned" : "dashboard-spawned";
  const tagList = [...new Set(["headless", originTag, config.hostname, ...(opts.tags ?? [])])].join(",");
  const capList = [...new Set(opts.capabilities ?? [])].join(",");
  const approval = opts.approvalMode || "guarded";

  return {
    ...(process.env as Record<string, string>),
    ...optional("AGENT_RELAY_TOKEN", config.token),
    ...config.env,
    ...agentProfileEnv(opts.agentProfile),
    ...(opts.env || {}),
    PATH: searchPath,
    AGENT_RELAY_URL: config.relayUrl,
    AGENT_RELAY_ORCHESTRATOR_URL: `http://127.0.0.1:${config.apiPort}`,
    AGENT_RELAY_ARTIFACT_URL: artifactProxyBaseUrl(config),
    AGENT_RELAY_APPROVAL: approval,
    ...optional("AGENT_RELAY_AGENT_PROFILE", opts.profile),
    ...(opts.agentProfile ? { AGENT_RELAY_AGENT_PROFILE_JSON: JSON.stringify(opts.agentProfile) } : {}),
    AGENT_RELAY_TAGS: tagList,
    AGENT_RELAY_CAPS: capList,
    AGENT_RELAY_CAPABILITIES: capList,
    AGENT_RELAY_HEADLESS: "1",
    ...optional("AGENT_RELAY_LOG_FILE", logFile),
    ...optional("AGENT_RELAY_TMUX_SESSION", tmuxSession),
    ...optional("AGENT_RELAY_LABEL", opts.label),
    ...optional("AGENT_RELAY_POLICY", opts.policyName),
    ...optional("AGENT_RELAY_SPAWN_REQUEST_ID", opts.spawnRequestId),
    AGENT_RELAY_LIFECYCLE: opts.lifecycle ?? "persistent",
    AGENT_RELAY_WORKSPACE_MODE: opts.workspaceMode ?? "inherit",
    ...(opts.workspace ? { AGENT_RELAY_WORKSPACE_JSON: JSON.stringify(opts.workspace) } : {}),
    ...optional("AGENT_RELAY_AUTOMATION_ID", opts.automationId),
    ...optional("AGENT_RELAY_AUTOMATION_RUN_ID", opts.automationRunId),
  };
}
100
+
101
/**
 * Extract the string-valued entries of a profile's `env` object.
 *
 * @param profile - Agent profile, possibly undefined.
 * @returns The profile's `env` with every non-string value dropped, or an empty
 *   object when `env` is missing, not an object, or an array.
 */
function agentProfileEnv(profile: Record<string, unknown> | undefined): Record<string, string> {
  const candidate = profile?.env;
  if (typeof candidate !== "object" || candidate === null || Array.isArray(candidate)) return {};

  const pairs: Array<[string, string]> = [];
  for (const [name, value] of Object.entries(candidate as Record<string, unknown>)) {
    if (typeof value === "string") pairs.push([name, value]);
  }
  return Object.fromEntries(pairs);
}
@@ -0,0 +1,13 @@
1
+ import { homedir } from "node:os";
2
+ import { join } from "node:path";
3
+
4
/** Join path segments beneath the per-user `~/.agent-relay` directory. */
const underRelayHome = (...segments: string[]): string => join(homedir(), ".agent-relay", ...segments);

/** Directory holding log files. */
export const LOG_DIR = underRelayHome("logs");
/** JSON file holding the persisted session records. */
export const STATE_FILE = underRelayHome("orchestrator-sessions.json");
/** Sessions directory. */
export const SESSION_DIR = underRelayHome("sessions");
/** Directory holding runner info files. */
export const RUNNER_INFO_DIR = underRelayHome("runners");
/** Default guest-terminal lifetime: one hour, in milliseconds. */
export const GUEST_TTL_MS = 60 * 60 * 1000;
/** JSON file holding the persisted guest-terminal registry. */
export const GUEST_STATE_FILE = underRelayHome("orchestrator-guests.json");
/** In-memory guest registry: tmux session name mapped to its expiry time (ms). */
export const terminalGuests = new Map<string, { expiresAt: number }>();
/** Whether the guest registry has been marked as hydrated. */
export let guestStateHydrated = false;

/** Record that the guest registry has been hydrated. */
export function markGuestStateHydrated(): void {
  guestStateHydrated = true;
}
@@ -0,0 +1,241 @@
1
+ import { mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
2
+ import { homedir } from "node:os";
3
+ import { join } from "node:path";
4
+ import type { OrchestratorConfig } from "../config";
5
+ import { errMessage } from "agent-relay-sdk";
6
+ import { shellEscape } from "agent-relay-sdk/shell-utils";
7
+ import { tmuxHasSession } from "agent-relay-sdk/tmux-utils";
8
+ import { sanitizeFsName } from "agent-relay-sdk/fs-name";
9
+ import { GUEST_STATE_FILE, GUEST_TTL_MS, guestStateHydrated, markGuestStateHydrated, terminalGuests } from "./constants";
10
+ import { findSessionRecord, isSessionRecordAlive, readRunnerInfo } from "./runtime";
11
+ import { isWithinBaseDir } from "./command";
12
+ import type { TerminalAttachSpec, TerminalGuestSession } from "./types";
13
+
14
/**
 * Start a detached "guest" tmux session for a managed runner.
 *
 * The runner is located from the persisted session records, asked over its
 * control URL for an attach spec, and the spec's command is launched in a new
 * tmux session named by `guestSessionName`. The guest is registered with an
 * expiry time and the registry is persisted.
 *
 * @param input - Selector for the managed session (tmux session name, spawn
 *   request id, agent id and/or policy name).
 * @param config - Orchestrator configuration.
 * @returns A description of the guest session that was created.
 * @throws Error when no live managed session matches, the runner exposes no
 *   control URL, the attach spec is rejected, or tmux fails to start the session.
 */
export async function createTerminalGuest(
  input: { agentId?: string; policyName?: string; spawnRequestId?: string; tmuxSession?: string },
  config: OrchestratorConfig,
): Promise<TerminalGuestSession> {
  cleanupExpiredTerminalGuests();
  const record = findSessionRecord(input);
  if (!record || !isSessionRecordAlive(record)) throw new Error("managed runner session not found");
  const runner = readRunnerInfo(record);
  if (!runner?.controlUrl) throw new Error("runner control URL is unavailable; restart the agent to enable terminal attach");
  const spec = await fetchTerminalAttachSpec(runner.controlUrl);
  validateAttachSpec(spec, config);
  const session = guestSessionName(config, spec.provider, record.agentId);
  // The name carries a random suffix; clear any session that already holds it
  // before creating the new one.
  killTmuxSession(session);

  // The spec is a JSON payload from the runner and `ttlMs` is not type-checked
  // during validation. A non-numeric value would turn the expiry into NaN, which
  // is dropped on reload and never matches a `<= now` comparison, so fall back to
  // the default lifetime unless it is a real number. The result is clamped to
  // the range [1 minute, 4 x default].
  const requestedTtl: unknown = spec.ttlMs;
  const ttlMs = typeof requestedTtl === "number" && !Number.isNaN(requestedTtl) ? requestedTtl : GUEST_TTL_MS;
  const expiresAt = Date.now() + Math.min(Math.max(ttlMs, 60_000), 4 * GUEST_TTL_MS);

  // tmux takes one shell command string, so every argument is escaped individually.
  const shellCmd = spec.command.map(shellEscape).join(" ");
  const tmuxArgs = ["new-session", "-d", "-s", session, "-x", "200", "-y", "50"];
  // Environment entries are passed in sorted order; names that are not valid
  // shell identifiers are skipped rather than handed to tmux.
  for (const [key, value] of Object.entries(spec.env ?? {}).sort(([a], [b]) => a.localeCompare(b))) {
    if (/^[A-Za-z_][A-Za-z0-9_]*$/.test(key)) tmuxArgs.push("-e", `${key}=${value}`);
  }
  tmuxArgs.push("-c", spec.cwd, shellCmd);

  const result = Bun.spawnSync(["tmux", ...tmuxArgs], {
    stdin: "ignore",
    stdout: "pipe",
    stderr: "pipe",
  });
  if (result.exitCode !== 0) {
    const stderr = result.stderr.toString().trim();
    throw new Error(stderr || `tmux guest creation failed with exit code ${result.exitCode}`);
  }

  terminalGuests.set(session, { expiresAt });
  saveGuestState();
  return { session, mode: "guest", provider: spec.provider, running: true, interactive: true, expiresAt };
}
47
+
48
/**
 * Stop a guest terminal session and drop it from the persisted registry.
 *
 * @param session - tmux session name; must carry this orchestrator's guest prefix.
 * @param config - Orchestrator configuration.
 * @returns The session name and whether a live tmux session was found and killed.
 * @throws Error when `session` is not a guest session name.
 */
export function stopTerminalGuest(session: string, config: OrchestratorConfig): { session: string; stopped: boolean } {
  if (!isGuestSessionName(session, config)) {
    throw new Error("terminal session is not a guest session");
  }

  const stopped = tmuxHasSession(session);
  if (stopped) {
    killTmuxSession(session);
  }

  // The registry entry is removed whether or not tmux still had the session.
  terminalGuests.delete(session);
  saveGuestState();
  return { session, stopped };
}
56
+
57
/**
 * Fetch the terminal attach spec from a runner's control endpoint.
 *
 * The request is aborted after five seconds. The response is only checked to be
 * a JSON object here; field validation is a separate step.
 *
 * @param controlUrl - Base control URL of the runner.
 * @throws Error carrying the runner's `error` string (or the HTTP status) on a
 *   non-2xx response, or when the body is not a JSON object.
 */
async function fetchTerminalAttachSpec(controlUrl: string): Promise<TerminalAttachSpec> {
  const response = await fetch(`${controlUrl}/terminal/attach-spec`, { signal: AbortSignal.timeout(5_000) });

  // A body that fails to parse is treated as absent rather than as a failure.
  let payload: unknown;
  try {
    payload = await response.json();
  } catch {
    payload = null;
  }
  const isObject = typeof payload === "object" && payload !== null && !Array.isArray(payload);

  if (!response.ok) {
    const reported = isObject ? (payload as { error?: unknown }).error : undefined;
    throw new Error(typeof reported === "string" ? reported : `runner attach-spec failed with ${response.status}`);
  }
  if (!isObject) {
    throw new Error("runner attach-spec response must be an object");
  }
  return payload as TerminalAttachSpec;
}
69
+
70
/**
 * Reject an attach spec that is not safe to hand to tmux.
 *
 * Checks, in order: mode is `"guest"`, provider is a non-blank string, cwd is a
 * string inside `config.baseDir`, command is a non-empty array of non-empty
 * strings, and env (when present) is an object whose values are all strings.
 *
 * @throws Error describing the first failed check.
 */
function validateAttachSpec(spec: TerminalAttachSpec, config: OrchestratorConfig): void {
  if (spec.mode !== "guest") {
    throw new Error("runner attach-spec mode must be guest");
  }

  const providerOk = typeof spec.provider === "string" && spec.provider.trim() !== "";
  if (!providerOk) {
    throw new Error("runner attach-spec provider required");
  }

  const cwdOk = typeof spec.cwd === "string" && isWithinBaseDir(spec.cwd, config.baseDir);
  if (!cwdOk) {
    throw new Error("runner attach-spec cwd must be within base directory");
  }

  const commandOk = Array.isArray(spec.command)
    && spec.command.length > 0
    && spec.command.every((item) => typeof item === "string" && item !== "");
  if (!commandOk) {
    throw new Error("runner attach-spec command must be a non-empty string array");
  }

  // `env` is optional, but anything other than `undefined` must be a string record.
  if (spec.env !== undefined) {
    const envOk = typeof spec.env === "object"
      && spec.env !== null
      && !Array.isArray(spec.env)
      && Object.values(spec.env).every((value) => typeof value === "string");
    if (!envOk) {
      throw new Error("runner attach-spec env must be a string record");
    }
  }
}
81
+
82
/**
 * Generate a tmux session name for a guest terminal:
 * `<tmuxPrefix>-guest-<provider>-<agent>-<8 random characters>`.
 *
 * Provider and agent id are sanitised and lower-cased; the agent part is capped
 * at 48 characters.
 */
function guestSessionName(config: OrchestratorConfig, provider: string, agentId: string): string {
  const providerPart = sanitizeFsName(provider, { replacement: "-", lowercase: true, fallback: "provider" });
  const agentPart = sanitizeFsName(agentId, { replacement: "-", lowercase: true, maxLen: 48, fallback: "agent" });
  const nonce = crypto.randomUUID().slice(0, 8);
  return `${config.tmuxPrefix}-guest-${providerPart}-${agentPart}-${nonce}`;
}
87
+
88
/** Whether `session` carries this orchestrator's `<tmuxPrefix>-guest-` prefix. */
function isGuestSessionName(session: string, config: OrchestratorConfig): boolean {
  const marker = `${config.tmuxPrefix}-guest-`;
  return session.slice(0, marker.length) === marker;
}
91
+
92
/** One persisted guest-registry entry, as written to the guest state file. */
interface GuestRecord {
  /** tmux session name of the guest. */
  session: string;
  /** Expiry time in epoch milliseconds. */
  expiresAt: number;
}

/** A guest tmux session that currently exists, as reported by tmux. */
interface LiveGuestSession {
  /** tmux session name of the guest. */
  session: string;
  /** Session creation time in epoch milliseconds. */
  createdAtMs: number;
}
101
+
102
/**
 * Convert the in-memory guest registry into a list suitable for persisting.
 *
 * @param guests - Registry keyed by tmux session name.
 * @returns One `{ session, expiresAt }` record per guest, ordered by session
 *   name so the output is deterministic.
 */
export function serializeGuests(guests: Map<string, { expiresAt: number }>): GuestRecord[] {
  const records: GuestRecord[] = [];
  for (const [session, guest] of guests) {
    records.push({ session, expiresAt: guest.expiresAt });
  }
  records.sort((left, right) => left.session.localeCompare(right.session));
  return records;
}
108
+
109
/**
 * Rebuild a guest registry from parsed JSON, tolerating bad data.
 *
 * Anything that is not an array yields an empty registry. Entries are skipped
 * unless they are objects with a non-empty string `session` and a finite
 * numeric `expiresAt`. A later entry for the same session replaces an earlier one.
 *
 * @param raw - Value produced by `JSON.parse` on the persisted state.
 */
export function deserializeGuests(raw: unknown): Map<string, { expiresAt: number }> {
  const guests = new Map<string, { expiresAt: number }>();
  const entries: unknown[] = Array.isArray(raw) ? raw : [];

  for (const entry of entries) {
    if (typeof entry !== "object" || entry === null) continue;
    const candidate = entry as Record<string, unknown>;
    const session = candidate.session;
    const expiresAt = candidate.expiresAt;
    if (typeof session !== "string" || session === "") continue;
    if (typeof expiresAt !== "number" || !Number.isFinite(expiresAt)) continue;
    guests.set(session, { expiresAt });
  }
  return guests;
}
122
+
123
/**
 * Persist the in-memory guest registry to the guest state file.
 *
 * The JSON is written to a sibling `.tmp` file and then renamed over the target.
 * Every failure is swallowed on purpose.
 */
function saveGuestState(): void {
  try {
    const stateDir = join(homedir(), ".agent-relay");
    mkdirSync(stateDir, { recursive: true });

    const scratch = `${GUEST_STATE_FILE}.tmp`;
    const payload = JSON.stringify(serializeGuests(terminalGuests), null, 2) + "\n";
    writeFileSync(scratch, payload);
    renameSync(scratch, GUEST_STATE_FILE);
  } catch {
    // Persistence is best-effort: a write failure must never break guest creation.
    // The periodic reaper's tmux age-based fallback still bounds orphan lifetime.
  }
}
134
+
135
/**
 * Load persisted guest entries into the in-memory registry so guest TTLs
 * survive an orchestrator restart. Call once at boot before the first reap.
 *
 * Only the first call does any work. Entries already present in memory are kept
 * in preference to the persisted ones.
 */
export function hydrateTerminalGuests(): void {
  if (guestStateHydrated) return;
  markGuestStateHydrated();

  let persisted: Map<string, { expiresAt: number }>;
  try {
    persisted = deserializeGuests(JSON.parse(readFileSync(GUEST_STATE_FILE, "utf8")));
  } catch {
    // No persisted state (first boot or unreadable) — the age-based fallback in
    // reapTerminalGuests still cleans any orphaned guest tmux sessions.
    return;
  }

  persisted.forEach((value, session) => {
    if (!terminalGuests.has(session)) terminalGuests.set(session, value);
  });
}
152
+
153
/**
 * List live `<prefix>-guest-*` tmux sessions together with their creation time.
 *
 * Runs `tmux list-sessions` with a `name<TAB>created` format and keeps only the
 * guest sessions. A creation time that does not parse as a finite number is
 * reported as 0.
 *
 * @returns The live guest sessions, or an empty list when tmux exits non-zero
 *   (no tmux server / no sessions).
 */
function listGuestTmuxSessions(config: OrchestratorConfig): LiveGuestSession[] {
  const { exitCode, stdout } = Bun.spawnSync(["tmux", "list-sessions", "-F", "#{session_name}\t#{session_created}"], {
    stdin: "ignore",
    stdout: "pipe",
    stderr: "ignore",
  });
  if (exitCode !== 0) return [];

  return stdout.toString().split("\n").flatMap((line): LiveGuestSession[] => {
    const separator = line.indexOf("\t");
    if (separator < 0) return [];
    const session = line.slice(0, separator);
    if (!isGuestSessionName(session, config)) return [];
    // tmux reports creation time in seconds; convert to milliseconds.
    const createdSec = Number(line.slice(separator + 1).trim());
    return [{ session, createdAtMs: Number.isFinite(createdSec) ? createdSec * 1000 : 0 }];
  });
}
172
+
173
/**
 * Decide which live guest sessions to reap. Pure, so the TTL policy is testable
 * without tmux or the filesystem:
 *   - tracked and not provably still valid (expiry reached, or not a usable number) → reap
 *   - untracked (metadata lost across a restart) and not provably younger than
 *     the fallback TTL → reap
 *
 * Both comparisons are written as "keep only when provably fresh", so a NaN
 * expiry or creation time is reaped instead of being kept forever. A plain
 * `expiresAt <= now` test is false for NaN.
 *
 * @param tracked - Guest registry keyed by tmux session name.
 * @param liveGuests - Guest sessions that currently exist in tmux.
 * @param now - Current time in epoch milliseconds.
 * @param fallbackTtlMs - Maximum age allowed for an untracked guest.
 * @returns Session names to kill, de-duplicated, in `liveGuests` order.
 */
export function selectExpiredGuests(
  tracked: Map<string, { expiresAt: number }>,
  liveGuests: LiveGuestSession[],
  now: number,
  fallbackTtlMs = GUEST_TTL_MS,
): string[] {
  const toReap = new Set<string>();
  for (const { session, createdAtMs } of liveGuests) {
    const record = tracked.get(session);
    const stillValid = record
      ? record.expiresAt > now
      : now - createdAtMs < fallbackTtlMs;
    if (!stillValid) toReap.add(session);
  }
  return [...toReap];
}
196
+
197
/**
 * Kill guest tmux sessions whose TTL has elapsed, independent of any new guest
 * creation, and forget tracked guests whose tmux session no longer exists. Runs
 * at boot and on a periodic timer (see orchestrator index).
 *
 * @param config - Orchestrator configuration.
 * @param now - Current time in epoch milliseconds; defaults to `Date.now()`.
 * @returns The names of the sessions that were killed.
 */
export function reapTerminalGuests(config: OrchestratorConfig, now = Date.now()): string[] {
  const live = listGuestTmuxSessions(config);
  const reaped = selectExpiredGuests(terminalGuests, live, now);
  reaped.forEach((session) => {
    killTmuxSession(session);
    terminalGuests.delete(session);
  });

  // Drop tracked guests with no live tmux session (for example manually killed)
  // so the registry can't grow without bound.
  const liveNames = new Set(live.map((guest) => guest.session));
  const vanished = [...terminalGuests.keys()].filter((session) => !liveNames.has(session));
  vanished.forEach((session) => terminalGuests.delete(session));

  if (reaped.length > 0 || vanished.length > 0) saveGuestState();
  return reaped;
}
222
+
223
/**
 * Kill and forget every tracked guest whose expiry is not in the future, then
 * persist the registry if anything was removed.
 */
function cleanupExpiredTerminalGuests(): void {
  const now = Date.now();
  // Snapshot the expired names first so the registry is not mutated mid-iteration.
  const expired = [...terminalGuests]
    .filter(([, guest]) => !(guest.expiresAt > now))
    .map(([session]) => session);

  expired.forEach((session) => {
    killTmuxSession(session);
    terminalGuests.delete(session);
  });
  if (expired.length > 0) saveGuestState();
}
234
+
235
/**
 * Run `tmux kill-session -t <session>` with all stdio ignored.
 * The exit code is not inspected.
 */
function killTmuxSession(session: string): void {
  const argv = ["tmux", "kill-session", "-t", session];
  Bun.spawnSync(argv, { stdin: "ignore", stdout: "ignore", stderr: "ignore" });
}
@@ -0,0 +1,10 @@
1
+ export * from "./command";
2
+ export * from "./guests";
3
+ export * from "./log-utils";
4
+ export * from "./runtime";
5
+ export * from "./sessions";
6
+ export * from "./spawn-agent";
7
+ export * from "./supervisor";
8
+ export * from "./systemd";
9
+ export * from "./terminal";
10
+ export type { SessionRecord, SpawnOptions, TerminalSnapshot } from "./types";
@@ -0,0 +1,22 @@
1
/**
 * Split log text into its non-blank lines.
 *
 * @param content - Raw log text.
 * @param sanitize - When true (default) the text is passed through
 *   `sanitizeLogText`; when false only line endings are normalised.
 * @returns Lines with trailing whitespace removed; whitespace-only lines are dropped.
 */
export function logLines(content: string, sanitize = true): string[] {
  const normalized = sanitize
    ? sanitizeLogText(content)
    : content.replace(/\r\n/g, "\n").replace(/\r/g, "\n");

  const kept: string[] = [];
  for (const raw of normalized.split("\n")) {
    const line = raw.trimEnd();
    if (line.trim().length > 0) kept.push(line);
  }
  return kept;
}
8
+
9
/**
 * Strip terminal control sequences from log text and normalise line endings.
 *
 * The replacements run in a fixed order; later, broader patterns rely on the
 * earlier, more specific ones having already consumed their sequences.
 *
 * @param content - Raw text, possibly containing escape sequences.
 * @returns The text with escape sequences and control characters removed.
 */
export function sanitizeLogText(content: string): string {
  let text = content;

  // String-type sequences: ESC ] ... ended by BEL or ESC \, and ESC P/X/^/_ ... ended by ESC \.
  text = text.replace(/\x1B\][^\x07\x1B]*(?:\x07|\x1B\\)/g, "");
  text = text.replace(/\x1B[PX^_][\s\S]*?\x1B\\/g, "");

  // ESC [ n C becomes n spaces (default 1, capped at 120).
  text = text.replace(/\x1B\[(\d*)C/g, (_match, count: string) => " ".repeat(Math.min(Number(count || "1"), 120)));

  // Remaining ESC [ sequences, two-character ESC forms, other ESC sequences, and \x9B-introduced sequences.
  text = text.replace(/\x1B\[[0-?]*[ -/]*[@-~]/g, "");
  text = text.replace(/\x1B[()#%*+\-.\/ ][ -~]/g, "");
  text = text.replace(/\x1B[ -/]*[@-~]/g, "");
  text = text.replace(/\x9B[0-?]*[ -/]*[@-~]/g, "");

  // Any ESC still present, together with the single character after it.
  text = text.replace(/\x1B.?/g, "");

  // Normalise line endings, then drop remaining control characters (tab and newline are kept).
  text = text.replace(/\r\n/g, "\n");
  text = text.replace(/\r/g, "\n");
  text = text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
  return text;
}
@@ -0,0 +1,151 @@
1
+ import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
2
+ import { homedir } from "node:os";
3
+ import { join } from "node:path";
4
+ import type { ManagedAgentReport } from "../relay";
5
+ import { isPidAlive, parseProcStateIsZombie } from "agent-relay-sdk/process-utils";
6
+ import { tmuxHasSession } from "agent-relay-sdk/tmux-utils";
7
+ import { LOG_DIR, RUNNER_INFO_DIR, SESSION_DIR, STATE_FILE } from "./constants";
8
+ import { systemdMainPid } from "./systemd";
9
+ import { sanitizeFsName } from "agent-relay-sdk/fs-name";
10
+ import type { RunnerInfo, SessionRecord, SessionSupervisor } from "./types";
11
+
12
/** Path of the log file for the session `name`: `<LOG_DIR>/<name>.log`. */
export function logFilePath(name: string): string {
  const fileName = name + ".log";
  return join(LOG_DIR, fileName);
}
15
+
16
/**
 * Path of the runner info file for `name`: `<RUNNER_INFO_DIR>/<sanitised name>.json`.
 * The name is sanitised first, falling back to `runner`.
 */
export function runnerInfoPath(name: string): string {
  const fileStem = sanitizeFsName(name, { replacement: "-", trimEdge: true, fallback: "runner" });
  return join(RUNNER_INFO_DIR, fileStem + ".json");
}
20
+
21
/** Create the log directory (and any missing parents) if it does not exist. */
export function ensureLogDir(): void {
  const options = { recursive: true };
  mkdirSync(LOG_DIR, options);
}

/** Create the session directory with mode 0o700 if it does not exist. */
export function ensureSessionDir(): void {
  const options = { recursive: true, mode: 0o700 };
  mkdirSync(SESSION_DIR, options);
}

/** Create the runner-info directory with mode 0o700 if it does not exist. */
export function ensureRunnerInfoDir(): void {
  const options = { recursive: true, mode: 0o700 };
  mkdirSync(RUNNER_INFO_DIR, options);
}
32
+
33
/**
 * Persist the full list of session records to the state file.
 *
 * @param records - Complete set of records; replaces whatever was stored.
 */
export function saveState(records: SessionRecord[]): void {
  const stateDir = join(homedir(), ".agent-relay");
  mkdirSync(stateDir, { recursive: true });

  // Atomic write: a crash mid-write would otherwise leave truncated JSON and
  // loadState would silently return [], losing every tracked session.
  const scratch = `${STATE_FILE}.tmp`;
  const payload = JSON.stringify(records, null, 2) + "\n";
  writeFileSync(scratch, payload);
  renameSync(scratch, STATE_FILE);
}
41
+
42
/**
 * Read the persisted session records.
 *
 * @returns The stored records, or an empty list when the state file is missing,
 *   unreadable, not valid JSON, or does not contain an array.
 */
export function loadState(): SessionRecord[] {
  try {
    const parsed: unknown = JSON.parse(readFileSync(STATE_FILE, "utf8"));
    // Callers immediately call `.filter` / `.find` on the result, so valid JSON
    // that is not an array (e.g. `{}` or `null`) is treated like a missing file
    // rather than being allowed to throw at the call site.
    return Array.isArray(parsed) ? (parsed as SessionRecord[]) : [];
  } catch {
    return [];
  }
}
49
+
50
/**
 * Store `record`, replacing any existing record with the same name.
 * The new record is placed last.
 */
export function addSessionRecord(record: SessionRecord): void {
  const others = loadState().filter((existing) => existing.name !== record.name);
  saveState([...others, record]);
}
55
+
56
/** Delete the persisted record named `name`; the state file is rewritten either way. */
export function removeSessionRecord(name: string): void {
  const remaining = loadState().filter((existing) => existing.name !== name);
  saveState(remaining);
}
59
+
60
+ // Zombie-aware liveness primitives are shared with the runner via the SDK.
61
+ // Re-exported so existing `./spawn` consumers (and tests) keep resolving them.
62
+ export { isPidAlive, parseProcStateIsZombie };
63
+
64
/**
 * Return the supervisor descriptor of a session record.
 *
 * @param record - Record (or just its `supervisor` field); may be omitted.
 * @returns The record's supervisor, or `{ type: "process" }` when none is set.
 */
export function sessionSupervisor(record?: Pick<SessionRecord, "supervisor">): SessionSupervisor {
  const configured = record?.supervisor;
  if (configured === undefined || configured === null) {
    return { type: "process" };
  }
  return configured;
}
67
+
68
/**
 * Whether the process behind a session record is still running.
 *
 * For a systemd-supervised session with a unit name, liveness is judged on the
 * unit's current main PID; otherwise on the PID stored in the record.
 */
export function isSessionRecordAlive(record: SessionRecord): boolean {
  const supervisor = sessionSupervisor(record);
  if (supervisor.type !== "systemd" || !supervisor.unit) {
    return isPidAlive(record.pid);
  }
  const mainPid = systemdMainPid(supervisor.unit);
  return mainPid > 0 && isPidAlive(mainPid);
}
76
+
77
/**
 * Best-known PID for a session.
 *
 * @returns The systemd unit's main PID when the session is systemd-supervised
 *   and that PID is positive; otherwise the PID stored in the record.
 */
export function currentSessionPid(record: SessionRecord): number {
  const supervisor = sessionSupervisor(record);
  if (supervisor.type !== "systemd" || !supervisor.unit) {
    return record.pid;
  }
  const mainPid = systemdMainPid(supervisor.unit);
  return mainPid > 0 ? mainPid : record.pid;
}
85
+
86
/**
 * Derive the session-related fields of a managed-agent report from a record.
 *
 * The record name is used as the session name, tmux session and terminal
 * session alike. `terminalAvailable` reflects whether tmux currently has that
 * session, checked on the socket recorded in the runner info when present.
 */
export function sessionReportFields(record: Pick<SessionRecord, "name" | "supervisor" | "runnerInfoFile" | "agentId" | "provider">): Pick<ManagedAgentReport, "sessionName" | "tmuxSession" | "supervisor" | "systemdUnit" | "terminalSession" | "terminalAvailable"> {
  const { name } = record;
  const supervisor = sessionSupervisor(record);
  const socket = readRunnerInfo(record)?.tmuxSocket;
  const terminalAvailable = tmuxHasSession(name, socket);
  // The unit name is only reported for systemd-supervised sessions that have one.
  const systemdFields = supervisor.type === "systemd" && supervisor.unit
    ? { systemdUnit: supervisor.unit }
    : {};

  return {
    sessionName: name,
    tmuxSession: name,
    supervisor: supervisor.type,
    ...systemdFields,
    terminalSession: name,
    terminalAvailable,
  };
}
98
+
99
/**
 * Pick the session record matching a selector.
 *
 * Only the first selector present is used, in this priority order:
 *   1. `tmuxSession` — exact match on the record name;
 *   2. `spawnRequestId` — narrowed by `policyName` when that is also given;
 *   3. `agentId` — narrowed by `policyName` when that is also given;
 *   4. `policyName` alone — the record with the greatest `startedAt` (the
 *      earliest-listed record wins a tie).
 *
 * @param records - Records to search.
 * @param input - Selector fields; empty or missing fields are ignored.
 * @returns The matching record, or `undefined` when nothing matches.
 */
export function selectSessionRecord(records: SessionRecord[], input: { agentId?: string; policyName?: string; spawnRequestId?: string; tmuxSession?: string }): SessionRecord | undefined {
  const { tmuxSession, spawnRequestId, agentId, policyName } = input;
  // With no policy requested every record passes the policy check.
  const policyMatches = (record: SessionRecord): boolean => !policyName || record.policyName === policyName;

  if (tmuxSession) {
    return records.find((record) => record.name === tmuxSession);
  }
  if (spawnRequestId) {
    return records.find((record) => record.spawnRequestId === spawnRequestId && policyMatches(record));
  }
  if (agentId) {
    return records.find((record) => record.agentId === agentId && policyMatches(record));
  }
  if (!policyName) return undefined;

  let newest: SessionRecord | undefined;
  for (const record of records) {
    if (record.policyName !== policyName) continue;
    if (!newest || record.startedAt > newest.startedAt) newest = record;
  }
  return newest;
}
126
+
127
/**
 * Look up a session record in the persisted state.
 * Selector semantics are those of `selectSessionRecord`.
 */
export function findSessionRecord(input: { agentId?: string; policyName?: string; spawnRequestId?: string; tmuxSession?: string }): SessionRecord | undefined {
  const persisted = loadState();
  return selectSessionRecord(persisted, input);
}
130
+
131
/**
 * Read and validate the runner info file referenced by a session record.
 *
 * The file must contain a JSON object whose `controlUrl` is a string starting
 * with `http://127.0.0.1:`. Other fields are copied only when they have the
 * expected primitive type; `agentId` and `provider` fall back to the record's
 * values and `runnerId` falls back to an empty string.
 *
 * @returns The runner info, or `null` when the record has no info file, the
 *   file cannot be read or parsed, or the control URL is missing or not loopback.
 */
export function readRunnerInfo(record: Pick<SessionRecord, "runnerInfoFile" | "agentId" | "provider">): RunnerInfo | null {
  const file = record.runnerInfoFile;
  if (!file) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(file, "utf8"));
  } catch {
    return null;
  }
  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) return null;

  const info = parsed as Record<string, unknown>;
  // Typed accessors: yield the field only when it has the expected primitive type.
  const text = (key: string): string | undefined => {
    const value = info[key];
    return typeof value === "string" ? value : undefined;
  };
  const numeric = (key: string): number | undefined => {
    const value = info[key];
    return typeof value === "number" ? value : undefined;
  };

  const controlUrl = text("controlUrl");
  if (controlUrl === undefined || !controlUrl.startsWith("http://127.0.0.1:")) return null;

  return {
    agentId: text("agentId") ?? record.agentId,
    runnerId: text("runnerId") ?? "",
    provider: text("provider") ?? record.provider,
    controlUrl,
    tmuxSession: text("tmuxSession"),
    tmuxSocket: text("tmuxSocket"),
    pid: numeric("pid"),
    startedAt: numeric("startedAt"),
  };
}
@@ -0,0 +1,103 @@
1
+ import type { OrchestratorConfig } from "../config";
2
+ import type { ManagedAgentReport } from "../relay";
3
+ import { sanitizeFsName } from "agent-relay-sdk/fs-name";
4
+ import { shellEscape } from "agent-relay-sdk/shell-utils";
5
+ import { tmuxHasSession } from "agent-relay-sdk/tmux-utils";
6
+ import { cleanupSessionRecord } from "./supervisor";
7
+ import { currentSessionPid, findSessionRecord, isSessionRecordAlive, loadState, readRunnerInfo, saveState, sessionReportFields, sessionSupervisor } from "./runtime";
8
+ import type { SessionInfo, SessionRecord } from "./types";
9
+
10
/**
 * List the persisted sessions whose name starts with `<prefix>-`, annotated with
 * their current PID, liveness, supervisor and terminal availability.
 *
 * @param prefix - Session-name prefix (without the trailing dash).
 */
export function listSessions(prefix: string): SessionInfo[] {
  const marker = `${prefix}-`;
  const sessions: SessionInfo[] = [];

  for (const record of loadState()) {
    if (!record.name.startsWith(marker)) continue;
    const supervisor = sessionSupervisor(record);
    const systemdFields = supervisor.type === "systemd" && supervisor.unit
      ? { systemdUnit: supervisor.unit }
      : {};
    sessions.push({
      name: record.name,
      sessionName: record.name,
      pid: currentSessionPid(record),
      alive: isSessionRecordAlive(record),
      supervisor: supervisor.type,
      ...systemdFields,
      terminalSession: record.name,
      terminalAvailable: tmuxHasSession(record.name, readRunnerInfo(record)?.tmuxSocket),
      logFile: record.logFile,
    });
  }
  return sessions;
}
28
+
29
/** Whether a persisted session named `name` exists and its process is alive. */
export function isSessionAlive(name: string): boolean {
  const match = loadState().find((candidate) => candidate.name === name);
  if (!match) return false;
  return isSessionRecordAlive(match);
}
33
+
34
/**
 * Refresh a managed-agent report from its persisted session record.
 *
 * @param agent - Report to refresh.
 * @returns `agent` unchanged when no record matches; otherwise a copy with the
 *   workspace fields, PID and session fields taken from the record.
 */
export function refreshManagedAgentReport(agent: ManagedAgentReport): ManagedAgentReport {
  const selector = {
    tmuxSession: agent.sessionName ?? agent.tmuxSession,
    agentId: agent.agentId,
    policyName: agent.policyName,
    spawnRequestId: agent.spawnRequestId,
  };
  const record = findSessionRecord(selector);
  if (!record) return agent;

  // Workspace preference: the record's, then the report's, then one synthesised
  // from the record's workspace mode.
  const workspace = record.workspace
    ?? agent.workspace
    ?? (record.workspaceMode ? { mode: "shared", requestedMode: record.workspaceMode } : undefined);

  return {
    ...agent,
    workspaceMode: record.workspaceMode,
    workspace,
    pid: currentSessionPid(record),
    ...sessionReportFields(record),
  };
}
50
+
51
/**
 * Reconcile persisted sessions with reality at orchestrator start-up.
 *
 * Every persisted record under this orchestrator's tmux prefix is checked: dead
 * sessions are cleaned up, live ones get their PID refreshed and are returned as
 * managed-agent reports. The state file is then rewritten.
 *
 * @param config - Orchestrator configuration; `tmuxPrefix` selects the records.
 * @returns Reports for the sessions that are still alive.
 */
export async function recoverExistingSessions(
  config: OrchestratorConfig,
): Promise<ManagedAgentReport[]> {
  const ownedMarker = `${config.tmuxPrefix}-`;
  const inspected = loadState().filter((candidate) => candidate.name.startsWith(ownedMarker));
  const reports: ManagedAgentReport[] = [];
  const survivors: SessionRecord[] = [];

  for (const record of inspected) {
    if (!isSessionRecordAlive(record)) {
      console.error(`[orchestrator] Stale session: ${record.name} (pid ${record.pid} dead) — removing`);
      cleanupSessionRecord(record);
      continue;
    }

    const pid = currentSessionPid(record);
    const refreshed = { ...record, pid };
    survivors.push(refreshed);

    const workspace = record.workspace
      ?? (record.workspaceMode ? { mode: "shared", requestedMode: record.workspaceMode } : undefined);
    reports.push({
      agentId: record.agentId,
      provider: record.provider as "claude" | "codex",
      workspaceMode: record.workspaceMode,
      workspace,
      ...sessionReportFields(refreshed),
      cwd: record.cwd,
      label: record.label,
      approvalMode: record.approvalMode || "guarded",
      policyName: record.policyName,
      spawnRequestId: record.spawnRequestId,
      automationRunId: record.automationRunId,
      pid,
      startedAt: record.startedAt,
    });

    console.error(`[orchestrator] Recovered existing session: ${record.name} (pid ${record.pid})`);
  }

  // Merge rather than overwrite: only replace the records this recovery actually
  // inspected, so a session added concurrently (or owned by another prefix) is
  // not erased by writing back a pre-filtered snapshot.
  const inspectedNames = new Set(inspected.map((candidate) => candidate.name));
  const untouched = loadState().filter((candidate) => !inspectedNames.has(candidate.name));
  saveState([...untouched, ...survivors]);
  return reports;
}
95
+
96
/**
 * Generate an agent id for a managed agent:
 * `<host>-<provider>-<label>-<8 random characters>`, with host and label
 * sanitised and lower-cased.
 */
export function managedAgentId(config: OrchestratorConfig, provider: string, label: string): string {
  const hostPart = sanitizeFsName(config.hostname, { replacement: "-", lowercase: true });
  const labelPart = sanitizeFsName(label, { replacement: "-", lowercase: true });
  const nonce = crypto.randomUUID().slice(0, 8);
  return `${hostPart}-${provider}-${labelPart}-${nonce}`;
}
101
+
102
+ // Shared shell-quoting; re-exported so `./spawn` consumers + tests resolve it.
103
+ export { shellEscape };