agent-relay-orchestrator 0.119.11 → 0.120.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/config.ts +2 -0
- package/src/control.ts +84 -2
- package/src/process.ts +156 -17
- package/src/shared-callmux.ts +18 -4
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-relay-orchestrator",
|
|
3
|
-
"version": "0.119.11",
|
|
3
|
+
"version": "0.120.0",
|
|
4
4
|
"description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
},
|
|
18
18
|
"dependencies": {
|
|
19
19
|
"agent-relay-providers": "0.104.3",
|
|
20
|
-
"agent-relay-sdk": "0.2.
|
|
20
|
+
"agent-relay-sdk": "0.2.107",
|
|
21
21
|
"callmux": "0.23.0"
|
|
22
22
|
},
|
|
23
23
|
"devDependencies": {
|
package/src/config.ts
CHANGED
|
@@ -93,6 +93,8 @@ export const TERMINAL_COMMAND_TIMEOUT_MS = envNonNegativeMax("AGENT_RELAY_TERMIN
|
|
|
93
93
|
export const TERMINAL_GROUND_WAIT_MAX_MS = envNonNegativeMax("AGENT_RELAY_TERMINAL_GROUND_WAIT_MAX_MS", 500, 0);
|
|
94
94
|
export const TERMINAL_BACKFILL_SCROLLBACK_LINES = envNonNegativeMax("AGENT_RELAY_TERMINAL_BACKFILL_SCROLLBACK", 1000, 0);
|
|
95
95
|
export const TERMINAL_DEBUG = process.env.AGENT_RELAY_TERMINAL_DEBUG === "1";
|
|
96
|
+
export const SCHEDULER_COMMAND_MAX_CONCURRENT = envNonNegativeMax("AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT", 4, 1);
|
|
97
|
+
export const SCHEDULER_COMMAND_MAX_OUTPUT_BYTES = envNonNegativeMax("AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES", 256 * 1024, 4096);
|
|
96
98
|
|
|
97
99
|
interface RawConfig {
|
|
98
100
|
id?: string;
|
package/src/control.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { errMessage, isRecord, normalizeAgentLifecycle, normalizeWorkspaceMode, Semaphore } from "agent-relay-sdk";
|
|
1
|
+
import { errMessage, isPathWithinBase, isRecord, normalizeAgentLifecycle, normalizeWorkspaceMode, Semaphore } from "agent-relay-sdk";
|
|
2
2
|
import { getAllManifests, getManifest } from "agent-relay-providers";
|
|
3
|
-
import type { OrchestratorConfig } from "./config";
|
|
3
|
+
import { SCHEDULER_COMMAND_MAX_CONCURRENT, SCHEDULER_COMMAND_MAX_OUTPUT_BYTES, type OrchestratorConfig } from "./config";
|
|
4
4
|
import type { ManagedAgentReport, RelayClient, RelayCommand } from "./relay";
|
|
5
5
|
import { handleSelfUpgrade } from "./self-upgrade";
|
|
6
6
|
import { readLocalProviderConfigs } from "./provider-config-migration";
|
|
@@ -9,6 +9,7 @@ import { cleanupWorkspace, discardRecoveryBranch, idleRefreshWorktree, mergeWork
|
|
|
9
9
|
import { withMergePhaseTimeout } from "./workspace-probe/merge-timeouts";
|
|
10
10
|
import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
|
|
11
11
|
import type { WorkspaceMergeResult } from "agent-relay-sdk";
|
|
12
|
+
import { execProcess } from "./process";
|
|
12
13
|
|
|
13
14
|
// #638 — settle a `workspace.merge` command on whether it made progress, not on whether
|
|
14
15
|
// the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
|
|
@@ -64,6 +65,22 @@ export function resolveSpawnRegistrationTimeoutMs(env: NodeJS.ProcessEnv = proce
|
|
|
64
65
|
return parsed;
|
|
65
66
|
}
|
|
66
67
|
|
|
68
|
+
export function resolveMaxConcurrentSchedulerCommands(env: NodeJS.ProcessEnv = process.env): number {
|
|
69
|
+
const raw = env.AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT;
|
|
70
|
+
if (raw === undefined || raw.trim() === "") return SCHEDULER_COMMAND_MAX_CONCURRENT;
|
|
71
|
+
const parsed = Number(raw);
|
|
72
|
+
if (!Number.isInteger(parsed) || parsed < 1) return SCHEDULER_COMMAND_MAX_CONCURRENT;
|
|
73
|
+
return parsed;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
export function resolveSchedulerCommandMaxOutputBytes(env: NodeJS.ProcessEnv = process.env): number {
|
|
77
|
+
const raw = env.AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
|
|
78
|
+
if (raw === undefined || raw.trim() === "") return SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
|
|
79
|
+
const parsed = Number(raw);
|
|
80
|
+
if (!Number.isSafeInteger(parsed) || parsed < 4096) return SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
|
|
81
|
+
return parsed;
|
|
82
|
+
}
|
|
83
|
+
|
|
67
84
|
/** Awaits a spawn slot until the child registers, or resolves false at the fail-safe deadline. */
|
|
68
85
|
export type WaitForRegistration = (agent: ManagedAgentReport, timeoutMs: number) => Promise<boolean>;
|
|
69
86
|
|
|
@@ -98,6 +115,8 @@ interface ControlHandlerDeps {
|
|
|
98
115
|
registrationTimeoutMs?: number;
|
|
99
116
|
// Injectable stop dispatch — production uses tmux/system process teardown.
|
|
100
117
|
stopSession?: typeof stopSession;
|
|
118
|
+
maxConcurrentSchedulerCommands?: number;
|
|
119
|
+
schedulerCommandMaxOutputBytes?: number;
|
|
101
120
|
}
|
|
102
121
|
|
|
103
122
|
export function createControlHandler(
|
|
@@ -113,10 +132,13 @@ export function createControlHandler(
|
|
|
113
132
|
const spawnSlots = new Semaphore(deps.maxConcurrentSpawns ?? resolveMaxConcurrentSpawns());
|
|
114
133
|
const waitForRegistration = deps.waitForRegistration ?? waitForManagedRegistration;
|
|
115
134
|
const registrationTimeoutMs = deps.registrationTimeoutMs ?? resolveSpawnRegistrationTimeoutMs();
|
|
135
|
+
const schedulerCommandSlots = new Semaphore(deps.maxConcurrentSchedulerCommands ?? resolveMaxConcurrentSchedulerCommands());
|
|
136
|
+
const schedulerCommandMaxOutputBytes = deps.schedulerCommandMaxOutputBytes ?? resolveSchedulerCommandMaxOutputBytes();
|
|
116
137
|
// Backgrounded spawn tasks: each is claimed synchronously (status → accepted)
|
|
117
138
|
// then runs to completion off the poll tick, holding a slot across the child's
|
|
118
139
|
// registration window. Tracked so tests/shutdown can observe and settle them.
|
|
119
140
|
const backgroundSpawns = new Set<Promise<void>>();
|
|
141
|
+
const backgroundCommands = new Set<Promise<void>>();
|
|
120
142
|
|
|
121
143
|
async function spawnManagedAgent(opts: SpawnOptions, action = "Spawned"): Promise<ManagedAgentReport> {
|
|
122
144
|
const agent = await dispatchSpawn(opts, config);
|
|
@@ -272,9 +294,69 @@ export function createControlHandler(
|
|
|
272
294
|
// per-host cap (see dispatchSpawnCommand). Every other command is handled
|
|
273
295
|
// inline, synchronously with the poll tick, exactly as before.
|
|
274
296
|
if (command.type === "agent.spawn") return dispatchSpawnCommand(command);
|
|
297
|
+
if (command.type === "scheduler.command") return dispatchSchedulerCommand(command);
|
|
275
298
|
return handleNonSpawnCommand(command);
|
|
276
299
|
}
|
|
277
300
|
|
|
301
|
+
async function dispatchSchedulerCommand(command: RelayCommand): Promise<boolean> {
|
|
302
|
+
await relay.updateCommand(command.id, "accepted");
|
|
303
|
+
const startedAt = Date.now();
|
|
304
|
+
const run = (async () => {
|
|
305
|
+
await schedulerCommandSlots.acquire();
|
|
306
|
+
try {
|
|
307
|
+
await relay.updateCommand(command.id, "running");
|
|
308
|
+
const commandText = typeof command.params.command === "string" ? command.params.command : "";
|
|
309
|
+
const cwd = typeof command.params.cwd === "string" ? command.params.cwd : config.baseDir;
|
|
310
|
+
if (!commandText.trim()) throw new Error("scheduler.command missing command");
|
|
311
|
+
if (!isPathWithinBase(cwd, config.baseDir)) throw new Error("scheduler.command cwd escapes orchestrator baseDir");
|
|
312
|
+
const timeoutMs = typeof command.params.timeoutMs === "number" && Number.isSafeInteger(command.params.timeoutMs) && command.params.timeoutMs > 0
|
|
313
|
+
? command.params.timeoutMs
|
|
314
|
+
: undefined;
|
|
315
|
+
const env = isRecord(command.params.env)
|
|
316
|
+
? Object.fromEntries(Object.entries(command.params.env).filter((entry): entry is [string, string] => typeof entry[1] === "string"))
|
|
317
|
+
: undefined;
|
|
318
|
+
const result = await execProcess(["bash", "-lc", commandText], {
|
|
319
|
+
cwd,
|
|
320
|
+
env: { ...process.env, ...(env ?? {}) },
|
|
321
|
+
timeoutMs,
|
|
322
|
+
timeoutLabel: `scheduler command ${command.id}`,
|
|
323
|
+
trimStdout: false,
|
|
324
|
+
trimStderr: false,
|
|
325
|
+
maxOutputBytes: schedulerCommandMaxOutputBytes,
|
|
326
|
+
// #968 hole 2 — reap the whole tree on every exit path so a backgrounded/daemonized
|
|
327
|
+
// child (e.g. `setsid … &`) can't survive after the command returns.
|
|
328
|
+
reapProcessGroup: true,
|
|
329
|
+
});
|
|
330
|
+
const finishedAt = Date.now();
|
|
331
|
+
await relay.updateCommand(command.id, result.ok ? "succeeded" : result.timedOut ? "timed_out" : "failed", {
|
|
332
|
+
command: commandText,
|
|
333
|
+
cwd,
|
|
334
|
+
startedAt,
|
|
335
|
+
finishedAt,
|
|
336
|
+
durationMs: finishedAt - startedAt,
|
|
337
|
+
exitCode: result.exitCode,
|
|
338
|
+
success: result.ok,
|
|
339
|
+
timedOut: result.timedOut === true,
|
|
340
|
+
stdoutTail: result.stdout,
|
|
341
|
+
stderrTail: result.stderr,
|
|
342
|
+
outputTruncated: result.outputTruncated === true,
|
|
343
|
+
outputLimitExceeded: result.outputLimitExceeded === true,
|
|
344
|
+
}, result.ok ? undefined : result.stderr || `command exited ${result.exitCode ?? "without code"}`);
|
|
345
|
+
} catch (error) {
|
|
346
|
+
await relay.updateCommand(command.id, "failed", {
|
|
347
|
+
startedAt,
|
|
348
|
+
finishedAt: Date.now(),
|
|
349
|
+
durationMs: Date.now() - startedAt,
|
|
350
|
+
}, errMessage(error));
|
|
351
|
+
} finally {
|
|
352
|
+
schedulerCommandSlots.release();
|
|
353
|
+
}
|
|
354
|
+
})();
|
|
355
|
+
backgroundCommands.add(run);
|
|
356
|
+
run.finally(() => backgroundCommands.delete(run));
|
|
357
|
+
return true;
|
|
358
|
+
}
|
|
359
|
+
|
|
278
360
|
async function handleNonSpawnCommand(command: RelayCommand): Promise<boolean> {
|
|
279
361
|
await relay.updateCommand(command.id, "accepted");
|
|
280
362
|
await relay.updateCommand(command.id, "running");
|
package/src/process.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
|
+
import { mkdirSync, readFileSync, rmdirSync, writeFileSync } from "node:fs";
|
|
2
|
+
import { isPidAlive, killPid, processTreePids, waitForPidsExit } from "agent-relay-sdk/process-utils";
|
|
3
|
+
|
|
1
4
|
export interface ExecResult {
|
|
2
5
|
ok: boolean;
|
|
3
6
|
exitCode: number | null;
|
|
4
7
|
stdout: string;
|
|
5
8
|
stderr: string;
|
|
6
9
|
timedOut?: boolean;
|
|
10
|
+
outputTruncated?: boolean;
|
|
11
|
+
outputLimitExceeded?: boolean;
|
|
7
12
|
}
|
|
8
13
|
|
|
9
14
|
interface ExecOptions {
|
|
@@ -16,31 +21,117 @@ interface ExecOptions {
|
|
|
16
21
|
timeoutMs?: number;
|
|
17
22
|
timeoutLabel?: string;
|
|
18
23
|
streamDrainGraceMs?: number;
|
|
24
|
+
maxOutputBytes?: number;
|
|
19
25
|
signal?: AbortSignal;
|
|
26
|
+
/** Reap ALL descendants on EVERY exit path — normal exit included — not just on timeout/abort
|
|
27
|
+
* (#968 hole 2). Set for scheduler commands, which may background/daemonize children that would
|
|
28
|
+
* otherwise outlive the parent shell. On Linux this confines the command to a dedicated cgroup v2
|
|
29
|
+
* and kills the whole cgroup (bulletproof: a `setsid`-detached child cannot escape cgroup
|
|
30
|
+
* membership); elsewhere it falls back to a detached process-group + descendant-tree kill. */
|
|
31
|
+
reapProcessGroup?: boolean;
|
|
20
32
|
}
|
|
21
33
|
|
|
22
34
|
const DEFAULT_STREAM_DRAIN_GRACE_MS = 1_000;
|
|
35
|
+
const DEFAULT_KILL_GRACE_MS = 1_000;
|
|
36
|
+
|
|
37
|
+
let commandCgroupSeq = 0;
|
|
38
|
+
|
|
39
|
+
/** Create a dedicated child cgroup (v2) for a command so its entire process tree — including any
|
|
40
|
+
* `setsid`-detached grandchildren that reparent away and escape the process group — can be reaped
|
|
41
|
+
* in one shot via `cgroup.kill`. Returns the new cgroup's absolute path, or undefined when cgroup v2
|
|
42
|
+
* isn't available/writable (non-Linux, cgroup v1, or missing delegation) so callers fall back. */
|
|
43
|
+
function createCommandCgroup(): string | undefined {
|
|
44
|
+
if (process.platform !== "linux") return undefined;
|
|
45
|
+
try {
|
|
46
|
+
const self = readFileSync("/proc/self/cgroup", "utf8").trim();
|
|
47
|
+
const idx = self.indexOf("::"); // cgroup v2 line: "0::/rel/path"
|
|
48
|
+
if (idx < 0) return undefined;
|
|
49
|
+
const rel = self.slice(idx + 2).trim();
|
|
50
|
+
const path = `/sys/fs/cgroup${rel}/arcmd-${process.pid}-${Date.now()}-${commandCgroupSeq++}`;
|
|
51
|
+
mkdirSync(path);
|
|
52
|
+
return path;
|
|
53
|
+
} catch {
|
|
54
|
+
return undefined;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
function cgroupPids(cgPath: string): number[] {
|
|
59
|
+
try {
|
|
60
|
+
return readFileSync(`${cgPath}/cgroup.procs`, "utf8")
|
|
61
|
+
.split("\n")
|
|
62
|
+
.map((line) => Number(line.trim()))
|
|
63
|
+
.filter((pid) => Number.isFinite(pid) && pid > 0);
|
|
64
|
+
} catch {
|
|
65
|
+
return [];
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/** Kill every process in the command's cgroup and remove the cgroup. `cgroup.kill` SIGKILLs the
|
|
70
|
+
* whole subtree atomically; we then wait out any lingering PID-table entries and rmdir (retrying
|
|
71
|
+
* a few times, never hanging) so we don't leak an empty cgroup dir. */
|
|
72
|
+
async function reapCommandCgroup(cgPath: string): Promise<void> {
|
|
73
|
+
try { writeFileSync(`${cgPath}/cgroup.kill`, "1"); } catch {}
|
|
74
|
+
await waitForPidsExit(cgroupPids(cgPath), DEFAULT_KILL_GRACE_MS);
|
|
75
|
+
for (const pid of cgroupPids(cgPath)) killPid(pid, "SIGKILL");
|
|
76
|
+
for (let attempt = 0; attempt < 5; attempt++) {
|
|
77
|
+
try { rmdirSync(cgPath); return; } catch {}
|
|
78
|
+
await sleep(50);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/** Fallback reaper for hosts without cgroup v2: the command was spawned detached (its own process
|
|
83
|
+
* group, pgid === proc.pid), so signal the whole group plus any still-living descendants. Reuses the
|
|
84
|
+
* shared process helpers. NOTE: a `setsid`-detached child escapes the process group entirely — only
|
|
85
|
+
* the cgroup path above reaps those; this is the best achievable without cgroups. */
|
|
86
|
+
async function reapDetachedProcessGroup(proc: Bun.Subprocess): Promise<void> {
|
|
87
|
+
try { process.kill(-proc.pid, "SIGTERM"); } catch {}
|
|
88
|
+
const pids = await processTreePids([proc.pid]).catch(() => [proc.pid]);
|
|
89
|
+
for (const pid of pids) killPid(pid, "SIGTERM");
|
|
90
|
+
const all = [proc.pid, ...pids];
|
|
91
|
+
if (await waitForPidsExit(all, DEFAULT_KILL_GRACE_MS)) return;
|
|
92
|
+
try { process.kill(-proc.pid, "SIGKILL"); } catch {}
|
|
93
|
+
for (const pid of all.filter(isPidAlive)) killPid(pid, "SIGKILL");
|
|
94
|
+
}
|
|
23
95
|
|
|
24
96
|
interface StreamCapture {
|
|
25
97
|
done: Promise<void>;
|
|
26
98
|
text(): string;
|
|
99
|
+
truncated(): boolean;
|
|
100
|
+
limitExceeded(): boolean;
|
|
27
101
|
cancel(): void;
|
|
28
102
|
}
|
|
29
103
|
|
|
30
|
-
function captureStream(stream: ReadableStream<Uint8Array> | undefined): StreamCapture {
|
|
31
|
-
|
|
104
|
+
function captureStream(
|
|
105
|
+
stream: ReadableStream<Uint8Array> | undefined,
|
|
106
|
+
options: { maxOutputBytes?: number; onLimitExceeded?: () => void } = {},
|
|
107
|
+
): StreamCapture {
|
|
108
|
+
if (!stream) return { done: Promise.resolve(), text: () => "", truncated: () => false, limitExceeded: () => false, cancel: () => {} };
|
|
32
109
|
const reader = stream.getReader();
|
|
33
110
|
const decoder = new TextDecoder();
|
|
34
111
|
let output = "";
|
|
35
112
|
let canceled = false;
|
|
113
|
+
let truncated = false;
|
|
114
|
+
let limitExceeded = false;
|
|
36
115
|
const done = (async () => {
|
|
37
116
|
try {
|
|
38
117
|
while (!canceled) {
|
|
39
118
|
const chunk = await reader.read();
|
|
40
119
|
if (chunk.done) break;
|
|
41
120
|
output += decoder.decode(chunk.value, { stream: true });
|
|
121
|
+
if (options.maxOutputBytes && output.length > options.maxOutputBytes) {
|
|
122
|
+
truncated = true;
|
|
123
|
+
output = output.slice(output.length - options.maxOutputBytes);
|
|
124
|
+
if (!limitExceeded) {
|
|
125
|
+
limitExceeded = true;
|
|
126
|
+
options.onLimitExceeded?.();
|
|
127
|
+
}
|
|
128
|
+
}
|
|
42
129
|
}
|
|
43
130
|
output += decoder.decode();
|
|
131
|
+
if (options.maxOutputBytes && output.length > options.maxOutputBytes) {
|
|
132
|
+
truncated = true;
|
|
133
|
+
output = output.slice(output.length - options.maxOutputBytes);
|
|
134
|
+
}
|
|
44
135
|
} catch {
|
|
45
136
|
// Intentional cancellation on process timeout or a stuck post-exit pipe.
|
|
46
137
|
} finally {
|
|
@@ -50,6 +141,8 @@ function captureStream(stream: ReadableStream<Uint8Array> | undefined): StreamCa
|
|
|
50
141
|
return {
|
|
51
142
|
done,
|
|
52
143
|
text: () => output,
|
|
144
|
+
truncated: () => truncated,
|
|
145
|
+
limitExceeded: () => limitExceeded,
|
|
53
146
|
cancel: () => {
|
|
54
147
|
canceled = true;
|
|
55
148
|
void reader.cancel().catch(() => {});
|
|
@@ -74,35 +167,69 @@ function abortMessage(cmd: string[], options: ExecOptions): string {
|
|
|
74
167
|
return `${label} aborted`;
|
|
75
168
|
}
|
|
76
169
|
|
|
170
|
+
async function terminateProcessTree(proc: Bun.Subprocess): Promise<void> {
|
|
171
|
+
const pids = await processTreePids([proc.pid]).catch(() => [proc.pid]);
|
|
172
|
+
for (const target of pids) killPid(target, "SIGTERM");
|
|
173
|
+
try { proc.kill("SIGTERM"); } catch {}
|
|
174
|
+
const exited = await waitForPidsExit(pids, DEFAULT_KILL_GRACE_MS);
|
|
175
|
+
const alive = exited ? [] : pids.filter(isPidAlive);
|
|
176
|
+
for (const target of alive) killPid(target, "SIGKILL");
|
|
177
|
+
if (alive.includes(proc.pid)) {
|
|
178
|
+
try { proc.kill("SIGKILL"); } catch {}
|
|
179
|
+
}
|
|
180
|
+
if (alive.length > 0) await waitForPidsExit(alive, DEFAULT_KILL_GRACE_MS);
|
|
181
|
+
}
|
|
182
|
+
|
|
77
183
|
export async function execProcess(cmd: string[], options: ExecOptions = {}): Promise<ExecResult> {
|
|
78
184
|
if (options.signal?.aborted) {
|
|
79
185
|
return { ok: false, exitCode: null, stdout: "", stderr: abortMessage(cmd, options) };
|
|
80
186
|
}
|
|
81
|
-
|
|
187
|
+
// #968 hole 2 — for reap-on-every-exit commands, confine the whole tree to a dedicated cgroup so a
|
|
188
|
+
// `setsid`-detached child can't survive the parent. The wrapper joins the cgroup (writes its own
|
|
189
|
+
// pid to cgroup.procs) BEFORE exec-ing the real command, so every descendant is a member from birth
|
|
190
|
+
// (race-free); `exec "$0" "$@"` passes the original argv through untouched.
|
|
191
|
+
const cgroup = options.reapProcessGroup ? createCommandCgroup() : undefined;
|
|
192
|
+
const spawnCmd = cgroup
|
|
193
|
+
? ["sh", "-c", `echo $$ > ${cgroup}/cgroup.procs 2>/dev/null || true; exec "$0" "$@"`, ...cmd]
|
|
194
|
+
: cmd;
|
|
195
|
+
const proc = Bun.spawn(spawnCmd, {
|
|
82
196
|
cwd: options.cwd,
|
|
83
197
|
env: options.env,
|
|
84
198
|
stdin: "ignore",
|
|
85
199
|
stdout: options.stdout ?? "pipe",
|
|
86
200
|
stderr: options.stderr ?? "pipe",
|
|
201
|
+
// Own process group so the fallback reaper can group-signal without touching the orchestrator.
|
|
202
|
+
...(options.reapProcessGroup ? { detached: true } : {}),
|
|
87
203
|
});
|
|
88
|
-
const stdoutCapture = options.stdout === "ignore" ? captureStream(undefined) : captureStream(proc.stdout);
|
|
89
|
-
const stderrCapture = options.stderr === "ignore" ? captureStream(undefined) : captureStream(proc.stderr);
|
|
90
204
|
let timedOut = false;
|
|
91
205
|
let aborted = false;
|
|
206
|
+
let outputLimitExceeded = false;
|
|
207
|
+
let abortStarted = false;
|
|
92
208
|
let timeout: ReturnType<typeof setTimeout> | undefined;
|
|
93
|
-
let killTimeout: ReturnType<typeof setTimeout> | undefined;
|
|
94
209
|
let timeoutResolve: ((value: null) => void) | undefined;
|
|
95
210
|
const timeoutPromise = new Promise<null>((resolve) => { timeoutResolve = resolve; });
|
|
96
|
-
|
|
211
|
+
let abortDone: Promise<void> = Promise.resolve();
|
|
212
|
+
const abortProcess = (killParentImmediately = false) => {
|
|
213
|
+
if (abortStarted) return;
|
|
214
|
+
abortStarted = true;
|
|
97
215
|
aborted = true;
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
216
|
+
if (killParentImmediately) {
|
|
217
|
+
try { proc.kill("SIGTERM"); } catch {}
|
|
218
|
+
}
|
|
219
|
+
abortDone = terminateProcessTree(proc).finally(() => timeoutResolve?.(null));
|
|
220
|
+
};
|
|
221
|
+
const abortForOutputLimit = () => {
|
|
222
|
+
outputLimitExceeded = true;
|
|
223
|
+
abortProcess();
|
|
104
224
|
};
|
|
105
|
-
|
|
225
|
+
const stdoutCapture = options.stdout === "ignore"
|
|
226
|
+
? captureStream(undefined)
|
|
227
|
+
: captureStream(proc.stdout, { maxOutputBytes: options.maxOutputBytes, onLimitExceeded: abortForOutputLimit });
|
|
228
|
+
const stderrCapture = options.stderr === "ignore"
|
|
229
|
+
? captureStream(undefined)
|
|
230
|
+
: captureStream(proc.stderr, { maxOutputBytes: options.maxOutputBytes, onLimitExceeded: abortForOutputLimit });
|
|
231
|
+
const abortFromSignal = () => abortProcess(true);
|
|
232
|
+
options.signal?.addEventListener("abort", abortFromSignal, { once: true });
|
|
106
233
|
if (options.timeoutMs && options.timeoutMs > 0) {
|
|
107
234
|
timeout = setTimeout(() => {
|
|
108
235
|
timedOut = true;
|
|
@@ -112,10 +239,10 @@ export async function execProcess(cmd: string[], options: ExecOptions = {}): Pro
|
|
|
112
239
|
}
|
|
113
240
|
|
|
114
241
|
const exitCode = await (timeout || options.signal ? Promise.race([proc.exited, timeoutPromise]) : proc.exited);
|
|
115
|
-
options.signal?.removeEventListener("abort",
|
|
242
|
+
options.signal?.removeEventListener("abort", abortFromSignal);
|
|
116
243
|
if (timeout) clearTimeout(timeout);
|
|
117
|
-
if (!timedOut && !aborted && killTimeout) clearTimeout(killTimeout);
|
|
118
244
|
if (timedOut || aborted) {
|
|
245
|
+
await abortDone;
|
|
119
246
|
stdoutCapture.cancel();
|
|
120
247
|
stderrCapture.cancel();
|
|
121
248
|
}
|
|
@@ -125,20 +252,32 @@ export async function execProcess(cmd: string[], options: ExecOptions = {}): Pro
|
|
|
125
252
|
stdoutCapture.cancel();
|
|
126
253
|
stderrCapture.cancel();
|
|
127
254
|
|
|
255
|
+
// #968 hole 2 — reap the command's whole process tree on EVERY exit path (normal exit included),
|
|
256
|
+
// so a backgrounded/daemonized child can't outlive the command. Runs regardless of exit reason.
|
|
257
|
+
if (options.reapProcessGroup) {
|
|
258
|
+
if (cgroup) await reapCommandCgroup(cgroup);
|
|
259
|
+
else await reapDetachedProcessGroup(proc);
|
|
260
|
+
}
|
|
261
|
+
|
|
128
262
|
const stdout = stdoutCapture.text();
|
|
129
263
|
let stderr = stderrCapture.text();
|
|
130
264
|
if (timedOut) {
|
|
131
265
|
const msg = timeoutMessage(cmd, options);
|
|
132
266
|
stderr = stderr ? `${stderr}\n${msg}` : msg;
|
|
267
|
+
} else if (outputLimitExceeded) {
|
|
268
|
+
const msg = `output exceeded ${options.maxOutputBytes} bytes`;
|
|
269
|
+
stderr = stderr ? `${stderr}\n${msg}` : msg;
|
|
133
270
|
} else if (aborted) {
|
|
134
271
|
const msg = abortMessage(cmd, options);
|
|
135
272
|
stderr = stderr ? `${stderr}\n${msg}` : msg;
|
|
136
273
|
}
|
|
137
274
|
return {
|
|
138
|
-
ok: !timedOut && !aborted && exitCode === 0,
|
|
275
|
+
ok: !timedOut && !aborted && !outputLimitExceeded && exitCode === 0,
|
|
139
276
|
exitCode,
|
|
140
277
|
stdout: options.trimStdout === false ? stdout : stdout.trim(),
|
|
141
278
|
stderr: options.trimStderr === false ? stderr : stderr.trim(),
|
|
142
279
|
...(timedOut ? { timedOut } : {}),
|
|
280
|
+
...(stdoutCapture.truncated() || stderrCapture.truncated() ? { outputTruncated: true } : {}),
|
|
281
|
+
...(outputLimitExceeded ? { outputLimitExceeded: true } : {}),
|
|
143
282
|
};
|
|
144
283
|
}
|
package/src/shared-callmux.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
|
|
2
2
|
import { homedir } from "node:os";
|
|
3
3
|
import { dirname, join } from "node:path";
|
|
4
|
-
import { errMessage, isRecord } from "agent-relay-sdk";
|
|
4
|
+
import { errMessage, isRecord, SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS } from "agent-relay-sdk";
|
|
5
5
|
import { createListener, type CallmuxConfig, type CreateListenerOptions, type ListenerHealthSnapshot, type ProgrammaticListener } from "callmux";
|
|
6
6
|
import type { OrchestratorConfig } from "./config";
|
|
7
7
|
import { agentRelayHome } from "./config";
|
|
@@ -78,14 +78,18 @@ export function sharedMcpListenerUrl(): string {
|
|
|
78
78
|
export function writeSharedCallmuxConfig(opts: Pick<SharedCallmuxOptions, "configPath" | "sourceConfigPath"> & { registryServers?: CallmuxConfig["servers"] }): CallmuxConfig {
|
|
79
79
|
const source = readJsonObject(opts.sourceConfigPath);
|
|
80
80
|
const sourceServers = isRecord(source.servers) ? source.servers : {};
|
|
81
|
-
const servers = opts.registryServers && Object.keys(opts.registryServers).length > 0
|
|
81
|
+
const rawServers = opts.registryServers && Object.keys(opts.registryServers).length > 0
|
|
82
82
|
? cloneServers(opts.registryServers)
|
|
83
83
|
: fallbackSharedServers(sourceServers);
|
|
84
|
+
const servers = withSharedCallTimeoutFloor(rawServers);
|
|
84
85
|
const generated: CallmuxConfig = {
|
|
85
86
|
servers,
|
|
86
87
|
cacheTtlSeconds: numberFromRecord(source, "cacheTtlSeconds") ?? 10,
|
|
87
88
|
maxConcurrency: numberFromRecord(source, "maxConcurrency") ?? 20,
|
|
88
|
-
callTimeoutMs:
|
|
89
|
+
callTimeoutMs: Math.max(
|
|
90
|
+
numberFromRecord(source, "callTimeoutMs") ?? SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
|
|
91
|
+
SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
|
|
92
|
+
),
|
|
89
93
|
outputFormat: outputFormatFromRecord(source, "outputFormat") ?? "auto",
|
|
90
94
|
// Relay workers consume only proxied tokenlean+github tools; suppress callmux meta-tools.
|
|
91
95
|
exposeMetaTools: false,
|
|
@@ -305,6 +309,16 @@ function fallbackSharedServers(sourceServers: Record<string, unknown>): CallmuxC
|
|
|
305
309
|
};
|
|
306
310
|
}
|
|
307
311
|
|
|
312
|
+
function withSharedCallTimeoutFloor(servers: CallmuxConfig["servers"]): CallmuxConfig["servers"] {
|
|
313
|
+
return Object.fromEntries(Object.entries(servers).map(([name, server]) => {
|
|
314
|
+
const callTimeoutMs = Math.max(
|
|
315
|
+
server.callTimeoutMs ?? SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
|
|
316
|
+
SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
|
|
317
|
+
);
|
|
318
|
+
return [name, { ...server, callTimeoutMs }];
|
|
319
|
+
}));
|
|
320
|
+
}
|
|
321
|
+
|
|
308
322
|
function callmuxServerFromProvisioningVariant(value: unknown): { name: string; server: CallmuxConfig["servers"][string] } | null {
|
|
309
323
|
if (!isRecord(value) || value.enabled === false || value.approvalStatus === "pending" || value.validationStatus === "invalid") return null;
|
|
310
324
|
if (typeof value.name !== "string" || !value.name) return null;
|
|
@@ -383,7 +397,7 @@ function defaultGithubServer(): CallmuxConfig["servers"][string] {
|
|
|
383
397
|
command: "github-mcp-server",
|
|
384
398
|
args: ["stdio"],
|
|
385
399
|
prefix: "gh",
|
|
386
|
-
callTimeoutMs:
|
|
400
|
+
callTimeoutMs: SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
|
|
387
401
|
tools: GITHUB_TOOLS,
|
|
388
402
|
cachePolicy: { allowTools: ["issue_read", "list_issues", "search_issues", "search_code", "get_file_contents"] },
|
|
389
403
|
};
|