agent-relay-orchestrator 0.62.2 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,94 @@
1
+ import { closeSync, existsSync, openSync, rmSync } from "node:fs";
2
+ import type { OrchestratorConfig } from "../config";
3
+ import { resolveSpawnWorkspace, workspacesRoot } from "../workspace-probe";
4
+ import type { ManagedAgentReport } from "../relay";
5
+ import { buildEnv, buildRunnerCommand, defaultSpawnLabel, isWithinBaseDir, sessionName } from "./command";
6
+ import { addSessionRecord, ensureLogDir, ensureRunnerInfoDir, logFilePath, runnerInfoPath, sessionReportFields } from "./runtime";
7
+ import { managedAgentId } from "./sessions";
8
+ import { spawnRunner } from "./supervisor";
9
+ import type { SpawnOptions } from "./types";
10
+
11
/**
 * Spawn a managed agent runner and register it in the orchestrator's session state.
 *
 * Steps, in order: validate the requested cwd, resolve the spawn workspace, build the runner
 * command and environment, reset the per-session runner-info and log files, start the runner
 * via `spawnRunner`, persist a session record, and return the report for the new agent.
 *
 * @param opts - Spawn request. `label` and `agentId` are generated when not supplied.
 * @param config - Orchestrator configuration; `config.baseDir` bounds the allowed cwd.
 * @returns Report describing the spawned agent (ids, workspace, pid, supervisor-derived fields).
 * @throws Error when `opts.cwd` does not exist or is outside `config.baseDir`. Failures from
 *   workspace resolution, file preparation or `spawnRunner` propagate unchanged.
 */
export async function spawnAgent(
  opts: SpawnOptions,
  config: OrchestratorConfig,
): Promise<ManagedAgentReport> {
  const label = opts.label || defaultSpawnLabel();
  const agentId = opts.agentId || managedAgentId(config, opts.provider, label);
  const name = sessionName(config, opts.provider, label, opts.spawnRequestId ?? agentId);

  if (!existsSync(opts.cwd)) {
    throw new Error(`cwd does not exist: ${opts.cwd}`);
  }
  if (!isWithinBaseDir(opts.cwd, config.baseDir)) {
    throw new Error(`cwd must be within base directory: ${config.baseDir}`);
  }

  const resolvedWorkspace = await resolveSpawnWorkspace({
    ...opts,
    label,
    workspaceSymlinks: opts.workspaceSymlinks,
    workspaceRoot: workspacesRoot(config.baseDir),
  });
  // From here on only `spawnOpts` is used: its cwd/workspace are the resolved ones.
  const spawnOpts = { ...opts, label, agentId, cwd: resolvedWorkspace.cwd, workspace: resolvedWorkspace.workspace };

  const command = buildRunnerCommand(spawnOpts, config);

  ensureLogDir();
  ensureRunnerInfoDir();
  const logFile = logFilePath(name);
  const runnerInfoFile = runnerInfoPath(name);
  // Drop runner info left over from a previous session with the same name.
  rmSync(runnerInfoFile, { force: true });
  const env = buildEnv(
    { ...spawnOpts, env: { ...(spawnOpts.env ?? {}), AGENT_RELAY_RUNNER_INFO_FILE: runnerInfoFile } },
    config,
    logFile,
    name,
  );
  // Create/truncate the log file; the descriptor is not needed afterwards because
  // spawnRunner receives the path and opens the file itself.
  closeSync(openSync(logFile, "w"));

  console.error(`[orchestrator] Spawning ${opts.provider} agent: ${name}`);
  // Log the resolved cwd: this is the directory handed to spawnRunner, which can differ
  // from the requested opts.cwd once the workspace has been resolved.
  console.error(`[orchestrator] cwd: ${spawnOpts.cwd}`);
  console.error(`[orchestrator] command: ${command.join(" ")}`);
  console.error(`[orchestrator] log: ${logFile}`);

  const runner = spawnRunner(name, command, spawnOpts.cwd, env, logFile);
  // One timestamp shared by the stored record and the returned report so they agree.
  const startedAt = Date.now();
  const lifecycle = spawnOpts.lifecycle ?? "persistent";

  addSessionRecord({
    name,
    pid: runner.pid,
    supervisor: runner.supervisor,
    provider: spawnOpts.provider,
    model: spawnOpts.model,
    effort: spawnOpts.effort,
    profile: spawnOpts.profile,
    workspaceMode: spawnOpts.workspaceMode,
    lifecycle,
    workspace: spawnOpts.workspace,
    label,
    cwd: spawnOpts.cwd,
    logFile,
    runnerInfoFile,
    agentId,
    approvalMode: spawnOpts.approvalMode,
    policyName: spawnOpts.policyName,
    spawnRequestId: spawnOpts.spawnRequestId,
    automationId: spawnOpts.automationId,
    automationRunId: spawnOpts.automationRunId,
    startedAt,
  });

  return {
    agentId,
    provider: spawnOpts.provider,
    model: spawnOpts.model,
    effort: spawnOpts.effort,
    profile: spawnOpts.profile,
    workspaceMode: spawnOpts.workspaceMode,
    lifecycle,
    workspace: spawnOpts.workspace,
    ...sessionReportFields({ name, supervisor: runner.supervisor, runnerInfoFile, agentId, provider: spawnOpts.provider }),
    cwd: spawnOpts.cwd,
    label,
    approvalMode: spawnOpts.approvalMode || "guarded",
    policyName: spawnOpts.policyName,
    spawnRequestId: spawnOpts.spawnRequestId,
    automationRunId: spawnOpts.automationRunId,
    pid: runner.pid,
    startedAt,
  };
}
@@ -0,0 +1,347 @@
1
+ import { chmodSync, closeSync, existsSync, openSync, readFileSync, rmSync, statSync, writeFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import type { OrchestratorConfig } from "../config";
4
+ import type { ManagedSessionExitDiagnostics } from "../relay";
5
+ import { errMessage, extractClaudeModelUnavailableMessage } from "agent-relay-sdk";
6
+ import { isPidAlive } from "agent-relay-sdk/process-utils";
7
+ import { shellEscape } from "agent-relay-sdk/shell-utils";
8
+ import { tmuxHasSession } from "agent-relay-sdk/tmux-utils";
9
+ import { sanitizeFsName } from "agent-relay-sdk/fs-name";
10
+ import { SESSION_DIR } from "./constants";
11
+ import { logLines } from "./log-utils";
12
+ import { currentSessionPid, ensureSessionDir, findSessionRecord, isSessionRecordAlive, loadState, logFilePath, readRunnerInfo, removeSessionRecord, sessionSupervisor } from "./runtime";
13
+ import { systemdMainPid, systemdUnitName } from "./systemd";
14
+ import type { SessionRecord, SessionSupervisor, SpawnedRunner } from "./types";
15
+
16
/**
 * Start the runner for a session and report how it is supervised.
 *
 * When the systemd supervisor is usable the runner is started through it; if that attempt
 * throws, the failure is logged and the runner is started as a direct child process instead.
 * The direct child runs a generated launch script with stdout/stderr appended to `logFile`.
 *
 * @param name - Session name; used to derive the launch script path.
 * @param command - Runner argv.
 * @param cwd - Working directory for the runner.
 * @param env - Environment for the runner.
 * @param logFile - File receiving the runner's stdout and stderr.
 * @returns The runner pid together with the supervisor descriptor.
 */
export function spawnRunner(name: string, command: string[], cwd: string, env: Record<string, string>, logFile: string): SpawnedRunner {
  const systemdPreferred = shouldUseSystemdSupervisor();
  if (systemdPreferred) {
    try {
      return spawnSystemdRunner(name, command, cwd, env, logFile);
    } catch (failure) {
      console.error(`[orchestrator] systemd runner supervisor unavailable for ${name}: ${errMessage(failure)}`);
      console.error("[orchestrator] Falling back to process child; this agent will not survive orchestrator service restart.");
    }
  }

  // Direct-child path: write the launch script, then execute it.
  const scriptPath = launchScriptPath(name);
  ensureSessionDir();
  const scriptBody = buildLaunchScript(command, cwd, env);
  writeFileSync(scriptPath, scriptBody, { mode: 0o700 });
  // Re-apply the mode explicitly: the `mode` option only takes effect when the file is created.
  chmodSync(scriptPath, 0o700);

  const fd = openSync(logFile, "a");
  try {
    const child = Bun.spawn([scriptPath], { cwd, env, stdin: "ignore", stdout: fd, stderr: fd });
    return { pid: child.pid, supervisor: { type: "process", launchScript: scriptPath } };
  } finally {
    // Always release this process's copy of the descriptor, whether or not the spawn succeeded.
    closeSync(fd);
  }
}
45
+
46
/**
 * Decide whether runners should be supervised by the systemd user manager.
 *
 * Only considered on Linux. `AGENT_RELAY_DISABLE_SYSTEMD_SUPERVISOR=1` turns it off and
 * `AGENT_RELAY_FORCE_SYSTEMD_SUPERVISOR=1` turns it on without probing (disable wins when
 * both are set). Otherwise `systemctl --user show-environment` is run as a probe and the
 * supervisor is used only when that command exits with status 0.
 *
 * @returns `true` when the systemd supervisor should be attempted.
 */
function shouldUseSystemdSupervisor(): boolean {
  if (process.platform !== "linux") return false;
  if (process.env.AGENT_RELAY_DISABLE_SYSTEMD_SUPERVISOR === "1") return false;
  if (process.env.AGENT_RELAY_FORCE_SYSTEMD_SUPERVISOR === "1") return true;
  try {
    const probe = Bun.spawnSync(["systemctl", "--user", "show-environment"], {
      stdin: "ignore",
      stdout: "ignore",
      stderr: "ignore",
    });
    return probe.exitCode === 0;
  } catch {
    // The probe itself could not be started (for example `systemctl` is not installed).
    // That means "no systemd supervisor", not a reason to fail the caller's spawn.
    return false;
  }
}
57
+
58
/**
 * Start the runner as a transient systemd user unit.
 *
 * Writes the launch script, stops any unit already using the derived unit name, then runs
 * `systemd-run --user` with stdout/stderr appended to `logFile`, and waits briefly for the
 * unit's MainPID.
 *
 * @param name - Session name; used to derive the unit name and launch script path.
 * @param command - Runner argv.
 * @param cwd - Working directory for the runner (applied by the launch script).
 * @param env - Environment for the runner (exported by the launch script).
 * @param logFile - File receiving the unit's stdout and stderr.
 * @returns The unit's main pid and a `systemd` supervisor descriptor.
 * @throws Error when `systemd-run` exits non-zero (message is its stderr when available) or
 *   when no live MainPID is observed within 2 seconds.
 */
function spawnSystemdRunner(name: string, command: string[], cwd: string, env: Record<string, string>, logFile: string): SpawnedRunner {
  const unit = systemdUnitName(name);
  const launchScript = launchScriptPath(name);
  ensureSessionDir();
  writeFileSync(launchScript, buildLaunchScript(command, cwd, env), { mode: 0o700 });
  // Re-apply the mode explicitly: the `mode` option only takes effect when the file is created.
  chmodSync(launchScript, 0o700);

  // Make sure the unit name is free; the outcome of this stop is deliberately ignored.
  stopSystemdUnit(unit);

  const result = Bun.spawnSync([
    "systemd-run",
    "--user",
    `--unit=${unit}`,
    "--collect",
    "--property=KillMode=control-group",
    `--property=StandardOutput=append:${logFile}`,
    `--property=StandardError=append:${logFile}`,
    launchScript,
  ], {
    stdin: "ignore",
    stdout: "pipe",
    stderr: "pipe",
  });
  if (result.exitCode !== 0) {
    const stderr = result.stderr.toString().trim();
    throw new Error(stderr || `systemd-run failed with exit code ${result.exitCode}`);
  }

  const pid = waitForSystemdMainPid(unit, 2_000);
  if (!pid) {
    // systemd-run succeeded, so the unit may still be starting or running. The caller
    // treats this throw as "systemd unavailable" and starts a plain child process; stop
    // the unit first so the session never ends up with two runners.
    stopSystemdUnit(unit);
    throw new Error(`systemd unit ${unit}.service started without a MainPID`);
  }
  return { pid, supervisor: { type: "systemd", unit, launchScript } };
}
94
+
95
/**
 * Path of the launch script for a session: `<SESSION_DIR>/<sanitized session name>.sh`.
 *
 * @param session - Session name; passed through `sanitizeFsName` before use as a file name.
 */
function launchScriptPath(session: string): string {
  const fileStem = sanitizeFsName(session, { replacement: "-", trimEdge: true, fallback: "agent" });
  const fileName = `${fileStem}.sh`;
  return join(SESSION_DIR, fileName);
}
99
+
100
/**
 * Render the bash launch script for a runner.
 *
 * The script enables `set -euo pipefail`, exports every environment entry whose name is a
 * valid shell identifier (sorted by name, values shell-escaped), changes into `cwd` and
 * finally `exec`s the shell-escaped command. Entries with other names or an undefined value
 * are left out.
 *
 * @param command - Runner argv; each element is escaped individually.
 * @param cwd - Directory the script changes into before exec.
 * @param env - Environment entries to export.
 * @returns Script text terminated by a newline.
 */
export function buildLaunchScript(command: string[], cwd: string, env: Record<string, string>): string {
  const shellIdentifier = /^[A-Za-z_][A-Za-z0-9_]*$/;
  const exportable = Object.keys(env)
    .filter((name) => shellIdentifier.test(name) && env[name] !== undefined)
    .sort((left, right) => left.localeCompare(right));

  const script: string[] = ["#!/usr/bin/env bash", "set -euo pipefail"];
  for (const name of exportable) {
    script.push(`export ${name}=${shellEscape(String(env[name]))}`);
  }
  script.push(`cd ${shellEscape(cwd)}`);
  script.push(`exec ${command.map(shellEscape).join(" ")}`);
  // Trailing empty element makes the joined text end with a newline.
  script.push("");
  return script.join("\n");
}
114
+
115
/**
 * Poll (synchronously, every 50 ms) for the main pid of a systemd unit.
 *
 * @param unit - Unit name as accepted by `systemdMainPid`.
 * @param timeoutMs - How long to keep polling.
 * @returns The first positive pid reported for the unit that is also alive, or 0 when the
 *   time budget runs out first.
 */
function waitForSystemdMainPid(unit: string, timeoutMs: number): number {
  const giveUpAt = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= giveUpAt) return 0;
    const candidate = systemdMainPid(unit);
    if (candidate > 0 && isPidAlive(candidate)) return candidate;
    Bun.sleepSync(50);
  }
}
124
+
125
/**
 * Collect state information about a systemd user unit via `systemctl --user show`.
 *
 * @param unit - Unit name without the `.service` suffix.
 * @returns The unit name plus ActiveState, SubState, Result, ExecMainCode, ExecMainStatus and
 *   MainPID as reported (empty values and a non-positive MainPID become `undefined`). When
 *   `systemctl` exits non-zero, only `unit` and an `unavailable` reason are returned.
 */
function systemdUnitDiagnostics(unit: string): NonNullable<ManagedSessionExitDiagnostics["systemd"]> {
  const shown = Bun.spawnSync(
    [
      "systemctl", "--user", "show", `${unit}.service`,
      "-p", "ActiveState",
      "-p", "SubState",
      "-p", "Result",
      "-p", "ExecMainCode",
      "-p", "ExecMainStatus",
      "-p", "MainPID",
    ],
    { stdin: "ignore", stdout: "pipe", stderr: "pipe" },
  );
  if (shown.exitCode !== 0) {
    const reason = shown.stderr.toString().trim();
    return { unit, unavailable: reason || `systemctl show exited with ${shown.exitCode}` };
  }

  // Output is one `Key=Value` line per property; split at the first `=` only.
  const values = new Map<string, string>();
  shown.stdout.toString().split("\n").forEach((entry) => {
    const separator = entry.indexOf("=");
    if (separator > 0) values.set(entry.slice(0, separator), entry.slice(separator + 1));
  });
  const read = (key: string): string | undefined => values.get(key) || undefined;

  const reportedPid = Number(values.get("MainPID"));
  const mainPid = Number.isFinite(reportedPid) && reportedPid > 0 ? reportedPid : undefined;
  return {
    unit,
    activeState: read("ActiveState"),
    subState: read("SubState"),
    result: read("Result"),
    execMainCode: read("ExecMainCode"),
    execMainStatus: read("ExecMainStatus"),
    mainPid,
  };
}
162
+
163
/**
 * Summarize a session log file for exit diagnostics.
 *
 * @param logFile - Path of the stdout/stderr log.
 * @returns Size, emptiness flag and the last 20 lines (as split by `logLines`). If the file
 *   cannot be inspected, only `logUnavailable` with the error message is returned.
 */
function logFileDiagnostics(logFile: string): Pick<ManagedSessionExitDiagnostics, "logBytes" | "logEmpty" | "logTail"> & { logUnavailable?: string } {
  try {
    const { size } = statSync(logFile);
    if (size > 0) {
      const tail = logLines(readFileSync(logFile, "utf8")).slice(-20);
      return { logBytes: size, logEmpty: false, logTail: tail };
    }
    // Empty file: nothing to read.
    return { logBytes: 0, logEmpty: true, logTail: [] };
  } catch (failure) {
    return { logUnavailable: errMessage(failure) };
  }
}
179
+
180
/**
 * Build the human-readable `lastError` text for an exited session.
 *
 * For the `claude` provider, a model-unavailable message extracted from the log tail takes
 * precedence and is returned as-is. Otherwise the text is a `; `-joined list: how long the
 * session ran, the systemd unit state (or why it is unavailable), whether the log is empty
 * or unavailable, and whether runner info was written.
 *
 * @param record - Session record (provider and name are used).
 * @param diagnostics - Collected diagnostics without `lastError`.
 */
function describeSessionExit(record: SessionRecord, diagnostics: Omit<ManagedSessionExitDiagnostics, "lastError">): string {
  const { systemd, logEmpty, logBytes, logTail, runtimeMs, runnerInfoPresent } = diagnostics;

  if (record.provider === "claude") {
    const modelUnavailable = extractClaudeModelUnavailableMessage((logTail ?? []).join("\n"));
    if (modelUnavailable) return modelUnavailable;
  }

  // Joins the non-empty values with "/", or yields "unknown" when none are present.
  const orUnknown = (...values: Array<string | undefined>): string => values.filter(Boolean).join("/") || "unknown";

  const elapsedSeconds = Math.max(0, Math.round(runtimeMs / 1000));
  const summary = [`managed ${record.provider} session ${record.name} exited after ${elapsedSeconds}s`];

  if (systemd) {
    if (systemd.unavailable) {
      summary.push(`systemd status unavailable: ${systemd.unavailable}`);
    } else {
      const state = orUnknown(systemd.activeState, systemd.subState);
      const result = orUnknown(systemd.result);
      const exit = orUnknown(systemd.execMainCode, systemd.execMainStatus);
      summary.push(`systemd ${systemd.unit}.service state=${state} result=${result} exit=${exit}`);
    }
  }

  if (logEmpty) {
    summary.push("stdout/stderr log is empty");
  } else if (logBytes === undefined) {
    summary.push("stdout/stderr log unavailable");
  }

  if (!runnerInfoPresent) summary.push("runner info was not written");
  return summary.join("; ");
}
203
+
204
/**
 * Gather exit diagnostics for a managed session.
 *
 * Looks up the session record matching `input`, then collects the supervisor type, current
 * pid, terminal availability, log summary, runner-info presence and (for systemd-supervised
 * sessions with a unit) the unit state, and derives a `lastError` description from them.
 *
 * @param input - Identifiers used to find the session record.
 * @returns The diagnostics, or `null` when no matching record exists.
 */
export function diagnoseSessionExit(input: { agentId?: string; policyName?: string; spawnRequestId?: string; tmuxSession?: string }): ManagedSessionExitDiagnostics | null {
  const record = findSessionRecord(input);
  if (!record) return null;

  const detectedAt = Date.now();
  const supervisor = sessionSupervisor(record);
  const currentPid = currentSessionPid(record);
  const terminalAvailable = tmuxHasSession(record.name, readRunnerInfo(record)?.tmuxSocket);
  const { logUnavailable, logBytes, logEmpty, logTail } = logFileDiagnostics(record.logFile);
  const runnerInfoPresent = record.runnerInfoFile ? existsSync(record.runnerInfoFile) : false;

  // Human-readable list of the diagnostic sources that yielded nothing.
  const unavailable: string[] = [];
  if (logUnavailable) unavailable.push(`stdout/stderr log unavailable: ${logUnavailable}`);
  if (logEmpty) unavailable.push("stdout/stderr log empty");
  if (!runnerInfoPresent) unavailable.push("runner info unavailable");

  // Fallback workspace descriptor for records that only carry the requested mode.
  const workspace = record.workspace
    ?? (record.workspaceMode ? { mode: "shared", requestedMode: record.workspaceMode } : undefined);
  const systemdSupervised = supervisor.type === "systemd" && supervisor.unit;

  const base: Omit<ManagedSessionExitDiagnostics, "lastError"> = {
    agentId: record.agentId,
    provider: record.provider as "claude" | "codex",
    workspaceMode: record.workspaceMode,
    workspace,
    sessionName: record.name,
    tmuxSession: record.name,
    cwd: record.cwd,
    label: record.label,
    policyName: record.policyName,
    spawnRequestId: record.spawnRequestId,
    automationRunId: record.automationRunId,
    supervisor: supervisor.type,
    ...(systemdSupervised ? { systemdUnit: supervisor.unit } : {}),
    terminalSession: record.name,
    terminalAvailable,
    pid: record.pid,
    currentPid,
    startedAt: record.startedAt,
    detectedAt,
    runtimeMs: Math.max(0, detectedAt - record.startedAt),
    logFile: record.logFile,
    logBytes,
    logEmpty,
    logTail,
    runnerInfoFile: record.runnerInfoFile,
    runnerInfoPresent,
    ...(systemdSupervised ? { systemd: systemdUnitDiagnostics(supervisor.unit) } : {}),
    ...(unavailable.length ? { unavailable } : {}),
  };

  return { ...base, lastError: describeSessionExit(record, base) };
}
253
+
254
/**
 * Ask the systemd user manager to stop `<unit>.service`.
 * Output is discarded and the exit status is not inspected.
 *
 * @param unit - Unit name without the `.service` suffix.
 */
function stopSystemdUnit(unit: string): void {
  const service = `${unit}.service`;
  const silent = { stdin: "ignore", stdout: "ignore", stderr: "ignore" } as const;
  Bun.spawnSync(["systemctl", "--user", "stop", service], silent);
}
261
+
262
/**
 * Send SIGKILL to every process of `<unit>.service` in the systemd user manager.
 *
 * The `--kill-whom=` spelling is tried first. If that invocation exits non-zero it is
 * retried once with the legacy `--kill-who=` spelling, because older systemd releases only
 * understand the latter and would otherwise reject the command, leaving the unit running.
 * Output is discarded; a failing retry is ignored (callers re-check liveness themselves).
 *
 * @param unit - Unit name without the `.service` suffix.
 */
function killSystemdUnit(unit: string): void {
  const service = `${unit}.service`;
  const sendKill = (whomOption: string): number | null =>
    Bun.spawnSync(["systemctl", "--user", "kill", whomOption, "--signal=SIGKILL", service], {
      stdin: "ignore",
      stdout: "ignore",
      stderr: "ignore",
    }).exitCode;

  if (sendKill("--kill-whom=all") !== 0) {
    // Either the option name was not recognized or the unit is already gone; the retry is
    // harmless in the second case.
    sendKill("--kill-who=all");
  }
}
269
+
270
/**
 * Release what a supervisor descriptor refers to: stop the systemd unit (for systemd
 * supervisors that carry a unit name) and delete the launch script if one is recorded.
 *
 * @param supervisor - Supervisor descriptor of the session being cleaned up.
 */
function cleanupSupervisor(supervisor: SessionSupervisor): void {
  if (supervisor.type === "systemd" && supervisor.unit) {
    stopSystemdUnit(supervisor.unit);
  }
  if (supervisor.launchScript) {
    // `force` makes an already-missing script a no-op.
    rmSync(supervisor.launchScript, { force: true });
  }
}
274
+
275
/**
 * Clean up the on-disk and supervisor resources belonging to a session record: the
 * supervisor (see `cleanupSupervisor`) and the runner-info file, when one is recorded.
 * The record itself is not removed from state here.
 *
 * @param record - Session record to clean up after.
 */
export function cleanupSessionRecord(record: SessionRecord): void {
  const supervisor = sessionSupervisor(record);
  cleanupSupervisor(supervisor);

  const { runnerInfoFile } = record;
  if (runnerInfoFile) rmSync(runnerInfoFile, { force: true });
}
279
+
280
/**
 * Stop a managed session and remove its record once it is confirmed dead.
 *
 * systemd-supervised sessions are stopped through the unit and, if still alive after the
 * timeout, killed through the unit. Other sessions receive SIGTERM (only when `graceful`)
 * followed by SIGKILL if needed. The record is kept whenever the session is still alive at
 * the end, so a later call can retry.
 *
 * @param name - Session name; must start with `<config.tmuxPrefix>-`.
 * @param config - Orchestrator configuration.
 * @param reason - Reason, included in the log line.
 * @param graceful - Whether to allow the graceful timeout (default `true`).
 * @param timeoutMs - Requested graceful timeout; normalized by `sessionStopTimeoutMs`.
 * @returns `wasRunning` is false when no live session was found (any stale record is cleaned
 *   up); `stopped` is true only when a running session was confirmed dead.
 * @throws Error when `name` does not carry this orchestrator's prefix.
 */
export async function stopSession(name: string, config: OrchestratorConfig, reason: string, graceful = true, timeoutMs?: number): Promise<{ stopped: boolean; wasRunning: boolean }> {
  if (!name.startsWith(`${config.tmuxPrefix}-`)) throw new Error("session is not managed by this orchestrator");

  // Sleep in `stepMs` slices until `stillAlive` turns false or `budgetMs` has elapsed.
  const waitForExit = async (stillAlive: () => boolean, budgetMs: number, stepMs: number): Promise<void> => {
    const deadline = Date.now() + budgetMs;
    while (Date.now() < deadline && stillAlive()) {
      await Bun.sleep(stepMs);
    }
  };

  const record = loadState().find((entry) => entry.name === name);
  if (!record || !isSessionRecordAlive(record)) {
    // Nothing is running; just clear whatever stale state is left.
    if (record) cleanupSessionRecord(record);
    removeSessionRecord(name);
    return { stopped: false, wasRunning: false };
  }

  const pid = currentSessionPid(record);
  console.error(`[orchestrator] Stopping session ${name} (pid ${pid}): ${reason}`);

  const supervisor = sessionSupervisor(record);
  const gracefulTimeoutMs = sessionStopTimeoutMs(graceful, timeoutMs);

  if (supervisor.type === "systemd" && supervisor.unit) {
    const recordAlive = (): boolean => isSessionRecordAlive(record);

    stopSystemdUnit(supervisor.unit);
    await waitForExit(recordAlive, gracefulTimeoutMs, 200);
    if (recordAlive()) {
      killSystemdUnit(supervisor.unit);
      await waitForExit(recordAlive, 2_000, 100);
    }
    if (recordAlive()) return { stopped: false, wasRunning: true };

    cleanupSessionRecord(record);
    removeSessionRecord(name);
    return { stopped: true, wasRunning: true };
  }

  const pidAlive = (): boolean => isPidAlive(pid);

  if (graceful) {
    try { process.kill(pid, "SIGTERM"); } catch {}
    await waitForExit(pidAlive, gracefulTimeoutMs, 200);
  }

  if (pidAlive()) {
    try { process.kill(pid, "SIGKILL"); } catch {}
    await waitForExit(pidAlive, 2_000, 100);
  }

  // Never report success while the process is still alive: deleting the session
  // record here would orphan a running process with no handle to stop it again.
  if (pidAlive()) {
    console.error(`[orchestrator] Session ${name} (pid ${pid}) survived SIGKILL; keeping record for retry`);
    return { stopped: false, wasRunning: true };
  }

  cleanupSessionRecord(record);
  removeSessionRecord(name);
  return { stopped: true, wasRunning: true };
}
342
+
343
/**
 * Normalize the wait budget used when stopping a session.
 *
 * @param graceful - When false the budget is always 2 000 ms.
 * @param timeoutMs - Requested budget; anything that is not a positive safe integer falls
 *   back to 10 000 ms, and valid values are capped at 60 000 ms.
 * @returns Budget in milliseconds.
 */
function sessionStopTimeoutMs(graceful: boolean, timeoutMs?: number): number {
  if (!graceful) return 2_000;
  if (typeof timeoutMs !== "number" || !Number.isSafeInteger(timeoutMs) || timeoutMs <= 0) {
    return 10_000;
  }
  return timeoutMs > 60_000 ? 60_000 : timeoutMs;
}
@@ -0,0 +1,17 @@
1
+ import { sanitizeFsName } from "agent-relay-sdk/fs-name";
2
+
3
/**
 * Derive the systemd unit name (without `.service`) for a session:
 * `agent-relay-managed-<sanitized session name>`, truncated to 180 characters.
 *
 * @param session - Session name; passed through `sanitizeFsName` first.
 */
export function systemdUnitName(session: string): string {
  const cleaned = sanitizeFsName(session, { replacement: "-", trimEdge: true, fallback: "agent" });
  const fullName = "agent-relay-managed-" + cleaned;
  return fullName.substring(0, 180);
}
7
+
8
/**
 * Read the MainPID of `<unit>.service` from the systemd user manager.
 *
 * @param unit - Unit name without the `.service` suffix.
 * @returns The reported pid as a number, or 0 when `systemctl` exits non-zero or its output
 *   is not a finite number (empty output also yields 0).
 */
export function systemdMainPid(unit: string): number {
  const query = Bun.spawnSync(
    ["systemctl", "--user", "show", `${unit}.service`, "-p", "MainPID", "--value"],
    { stdin: "ignore", stdout: "pipe", stderr: "ignore" },
  );
  if (query.exitCode !== 0) return 0;

  const parsed = Number(query.stdout.toString().trim());
  if (!Number.isFinite(parsed)) return 0;
  return parsed;
}