agent-relay-orchestrator 0.84.0 → 0.86.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/control.ts +14 -1
- package/src/spawn/supervisor.ts +37 -1
- package/src/workspace-probe/merge.ts +62 -11
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-relay-orchestrator",
|
|
3
|
-
"version": "0.84.0",
|
|
3
|
+
"version": "0.86.0",
|
|
4
4
|
"description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"test": "bun test"
|
|
17
17
|
},
|
|
18
18
|
"dependencies": {
|
|
19
|
-
"agent-relay-sdk": "0.2.
|
|
19
|
+
"agent-relay-sdk": "0.2.66"
|
|
20
20
|
},
|
|
21
21
|
"devDependencies": {
|
|
22
22
|
"@types/bun": "latest",
|
package/src/control.ts
CHANGED
|
@@ -6,6 +6,17 @@ import { readLocalProviderConfigs } from "./provider-config-migration";
|
|
|
6
6
|
import { spawnAgent, stopSession, type SpawnOptions } from "./spawn";
|
|
7
7
|
import { cleanupWorkspace, discardRecoveryBranch, mergeWorkspace, pruneWorktrees, reconcileWorkspace, refreshWorkspaceDeps, workspacesRoot } from "./workspace-probe";
|
|
8
8
|
import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
|
|
9
|
+
import type { WorkspaceMergeResult } from "agent-relay-sdk";
|
|
10
|
+
|
|
11
|
+
// #638 — settle a `workspace.merge` command on whether it made progress, not on whether
|
|
12
|
+
// the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
|
|
13
|
+
// replay conflict, a push race) returns merged:false with an `error` set; every genuine
|
|
14
|
+
// land / recycle / noop-resolve / PR-opened leaves `error` undefined. Reporting `succeeded`
|
|
15
|
+
// for an unlanded branch is what let the auto-merge job believe it was progressing and
|
|
16
|
+
// busy-loop the repo's merge lease. Mirrors the pr-merge / pr-arm settle contract.
|
|
17
|
+
export function mergeCommandStatus(result: WorkspaceMergeResult): "succeeded" | "failed" {
|
|
18
|
+
return result.error ? "failed" : "succeeded";
|
|
19
|
+
}
|
|
9
20
|
|
|
10
21
|
interface ControlHandler {
|
|
11
22
|
handleCommand(command: RelayCommand): Promise<boolean>;
|
|
@@ -132,7 +143,9 @@ export function createControlHandler(
|
|
|
132
143
|
}
|
|
133
144
|
: undefined,
|
|
134
145
|
});
|
|
135
|
-
|
|
146
|
+
// #638 — settle `failed` (carrying the error) for a no-progress merge instead of
|
|
147
|
+
// `succeeded`; see mergeCommandStatus.
|
|
148
|
+
await relay.updateCommand(command.id, mergeCommandStatus(result), result as unknown as Record<string, unknown>, result.error);
|
|
136
149
|
} else if (command.type === "workspace.pr-arm-auto-merge") {
|
|
137
150
|
const rawPrNumber = command.params.prNumber;
|
|
138
151
|
const result = armWorkspacePrAutoMerge({
|
package/src/spawn/supervisor.ts
CHANGED
|
@@ -178,6 +178,37 @@ function logFileDiagnostics(logFile: string): Pick<ManagedSessionExitDiagnostics
|
|
|
178
178
|
}
|
|
179
179
|
}
|
|
180
180
|
|
|
181
|
+
/** Map systemd ExecMain* / Result + a best-effort journal OOM probe into signal/exitCode/oom (#636). */
|
|
182
|
+
function terminationDiagnostics(systemd: ManagedSessionExitDiagnostics["systemd"], pid?: number): Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> {
|
|
183
|
+
const out: Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> = {};
|
|
184
|
+
if (systemd?.result === "oom-kill") {
|
|
185
|
+
out.oom = { source: "systemd", detail: "Result=oom-kill" };
|
|
186
|
+
}
|
|
187
|
+
if (systemd?.execMainCode && /killed|dumped/i.test(systemd.execMainCode) && systemd.execMainStatus) {
|
|
188
|
+
out.signal = /^\d+$/.test(systemd.execMainStatus) ? `signal ${systemd.execMainStatus}` : `SIG${systemd.execMainStatus.replace(/^SIG/i, "")}`;
|
|
189
|
+
} else if (systemd?.execMainCode === "exited" && systemd.execMainStatus && /^\d+$/.test(systemd.execMainStatus)) {
|
|
190
|
+
out.exitCode = Number(systemd.execMainStatus);
|
|
191
|
+
}
|
|
192
|
+
// Best-effort kernel OOM-killer probe by pid — wrapped so a missing/permission-denied journal
|
|
193
|
+
// never breaks diagnosis. Only consulted when systemd didn't already flag the OOM.
|
|
194
|
+
if (!out.oom && pid && process.platform === "linux") {
|
|
195
|
+
try {
|
|
196
|
+
const result = Bun.spawnSync(["journalctl", "--user", "-k", "--no-pager", "-n", "200", "--grep", `Killed process ${pid}|Out of memory`], {
|
|
197
|
+
stdin: "ignore",
|
|
198
|
+
stdout: "pipe",
|
|
199
|
+
stderr: "ignore",
|
|
200
|
+
});
|
|
201
|
+
const text = result.exitCode === 0 ? result.stdout.toString() : "";
|
|
202
|
+
if (new RegExp(`Killed process ${pid}\\b|Out of memory: Killed process ${pid}\\b`).test(text)) {
|
|
203
|
+
out.oom = { source: "journal", detail: `OOM-killer hit pid ${pid}` };
|
|
204
|
+
}
|
|
205
|
+
} catch {
|
|
206
|
+
// journal unavailable — leave OOM unset.
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
return out;
|
|
210
|
+
}
|
|
211
|
+
|
|
181
212
|
function describeSessionExit(record: SessionRecord, diagnostics: Omit<ManagedSessionExitDiagnostics, "lastError">): string {
|
|
182
213
|
if (record.provider === "claude") {
|
|
183
214
|
const modelUnavailable = extractClaudeModelUnavailableMessage((diagnostics.logTail ?? []).join("\n"));
|
|
@@ -211,6 +242,8 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
|
|
|
211
242
|
const terminalAvailable = tmuxHasSession(record.name, readRunnerInfo(record)?.tmuxSocket);
|
|
212
243
|
const log = logFileDiagnostics(record.logFile);
|
|
213
244
|
const runnerInfoPresent = record.runnerInfoFile ? existsSync(record.runnerInfoFile) : false;
|
|
245
|
+
const systemd = supervisor.type === "systemd" && supervisor.unit ? systemdUnitDiagnostics(supervisor.unit) : undefined;
|
|
246
|
+
const termination = terminationDiagnostics(systemd, record.pid ?? currentPid);
|
|
214
247
|
const unavailable = [
|
|
215
248
|
...(log.logUnavailable ? [`stdout/stderr log unavailable: ${log.logUnavailable}`] : []),
|
|
216
249
|
...(log.logEmpty ? ["stdout/stderr log empty"] : []),
|
|
@@ -243,7 +276,10 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
|
|
|
243
276
|
logTail: log.logTail,
|
|
244
277
|
runnerInfoFile: record.runnerInfoFile,
|
|
245
278
|
runnerInfoPresent,
|
|
246
|
-
...(
|
|
279
|
+
...(systemd ? { systemd } : {}),
|
|
280
|
+
...(termination.signal ? { signal: termination.signal } : {}),
|
|
281
|
+
...(termination.exitCode !== undefined ? { exitCode: termination.exitCode } : {}),
|
|
282
|
+
...(termination.oom ? { oom: termination.oom } : {}),
|
|
247
283
|
...(unavailable.length ? { unavailable } : {}),
|
|
248
284
|
};
|
|
249
285
|
return {
|
package/src/workspace-probe/merge.ts
CHANGED
|
@@ -421,6 +421,39 @@ function recordNoFfMerge(
|
|
|
421
421
|
return { ok: true, mergeSha };
|
|
422
422
|
}
|
|
423
423
|
|
|
424
|
+
/**
|
|
425
|
+
* Fast-forward the local `base` ref to its fetched `upstream` tip (#638 concurrent-lane
|
|
426
|
+
* recovery). The caller has verified base is a strict ancestor of upstream — a clean ff,
|
|
427
|
+
* no divergence, nothing to lose. When base is checked out in a worktree, ff it there so
|
|
428
|
+
* that working tree stays consistent (only when it's clean — a dirty one is left untouched, #644); otherwise
|
|
429
|
+
* advance the ref directly. Returns an error string only when the sync genuinely can't be
|
|
430
|
+
* performed, so the merge path can surface it as a no-progress failure.
|
|
431
|
+
*/
|
|
432
|
+
function syncLocalBaseToUpstream(
|
|
433
|
+
repoRoot: string,
|
|
434
|
+
worktreePath: string,
|
|
435
|
+
base: string,
|
|
436
|
+
upstream: string,
|
|
437
|
+
): { ok: true } | { ok: false; error: string } {
|
|
438
|
+
const upstreamSha = git(["rev-parse", "--verify", upstream], worktreePath).stdout;
|
|
439
|
+
if (!upstreamSha) return { ok: false, error: `cannot resolve ${upstream} to sync ${base}` };
|
|
440
|
+
const baseWorktree = worktreeForBranch(repoRoot, base);
|
|
441
|
+
// Sync IN the base worktree only when it's clean — that keeps its working tree consistent
|
|
442
|
+
// with the advanced ref (the pristine home-repo checkout). When the base worktree is dirty
|
|
443
|
+
// (#644: a human's WIP in the shared checkout) we must NOT refuse and stall the whole repo's
|
|
444
|
+
// lands: advance the ref directly with update-ref instead. This is immune to the checkout's
|
|
445
|
+
// state and never reads or writes the human's working tree — their uncommitted work is left
|
|
446
|
+
// exactly as-is (the ref moves underneath; their files on disk are untouched).
|
|
447
|
+
if (baseWorktree && !baseWorktree.dirty) {
|
|
448
|
+
const ff = git(["merge", "--ff-only", upstream], baseWorktree.path);
|
|
449
|
+
if (!ff.ok) return { ok: false, error: ff.stderr || `failed to fast-forward ${base} to ${upstream}` };
|
|
450
|
+
return { ok: true };
|
|
451
|
+
}
|
|
452
|
+
const update = git(["update-ref", `refs/heads/${base}`, upstreamSha], repoRoot);
|
|
453
|
+
if (!update.ok) return { ok: false, error: update.stderr || `failed to advance ${base} to ${upstream}` };
|
|
454
|
+
return { ok: true };
|
|
455
|
+
}
|
|
456
|
+
|
|
424
457
|
function mergeRebaseFf(
|
|
425
458
|
input: WorkspaceMergeInput,
|
|
426
459
|
worktreePath: string,
|
|
@@ -433,11 +466,20 @@ function mergeRebaseFf(
|
|
|
433
466
|
if (!base) return head({ status: "review_requested", error: "no base branch to merge into" });
|
|
434
467
|
if (!branch) return head({ status: "review_requested", error: "cannot determine agent branch" });
|
|
435
468
|
|
|
436
|
-
// Reconcile with origin before landing (#190/#203). When base tracks an
|
|
437
|
-
// upstream (e.g. main -> origin/main) and we'll push, fetch it and
|
|
438
|
-
// origin has moved ahead of local base
|
|
439
|
-
//
|
|
440
|
-
//
|
|
469
|
+
// Reconcile with origin before landing (#190/#203/#638). When base tracks an
|
|
470
|
+
// upstream (e.g. main -> origin/main) and we'll push, fetch it and check whether
|
|
471
|
+
// origin has moved ahead of local base.
|
|
472
|
+
//
|
|
473
|
+
// Origin-ahead is the COMMON case under concurrency (#638): a sibling lane lands
|
|
474
|
+
// and advances origin/<base> while this lane still sits on a stale local base.
|
|
475
|
+
// Refusing here floods the steward on every normal multi-lane batch. Instead:
|
|
476
|
+
// - If local base carries nothing origin lacks (it's a strict ancestor of the
|
|
477
|
+
// fetched upstream), fast-forward local base to the upstream and land onto the
|
|
478
|
+
// FRESH origin (the no-ff merge below ties the branch onto the advanced base).
|
|
479
|
+
// This is a clean recovery — no steward escalation.
|
|
480
|
+
// - Refuse ONLY on genuine divergence: local base has commits not on origin, so
|
|
481
|
+
// a sync would rewrite/discard published-or-local history. A real failure must
|
|
482
|
+
// still set `error` so the relay's no-progress backoff (#638) can engage.
|
|
441
483
|
const upstream = upstreamRef(worktreePath, base);
|
|
442
484
|
const slash = upstream ? upstream.indexOf("/") : -1;
|
|
443
485
|
const remote = slash > 0 ? upstream!.slice(0, slash) : undefined; // remote of a `remote/branch` upstream
|
|
@@ -445,7 +487,13 @@ function mergeRebaseFf(
|
|
|
445
487
|
if (upstream && remote && pushEnabled) {
|
|
446
488
|
git(["fetch", remote, base], worktreePath); // best-effort freshness; a stale ref can only under-detect divergence
|
|
447
489
|
if (!git(["merge-base", "--is-ancestor", upstream, base], worktreePath).ok) {
|
|
448
|
-
|
|
490
|
+
// Origin moved ahead. Sync-then-land iff local base is cleanly behind (ancestor
|
|
491
|
+
// of upstream); otherwise it's genuine divergence — refuse without mutating.
|
|
492
|
+
if (!git(["merge-base", "--is-ancestor", base, upstream], worktreePath).ok) {
|
|
493
|
+
return head({ status: "review_requested", error: `local ${base} has diverged from ${upstream} (commits not on origin); sync before landing` });
|
|
494
|
+
}
|
|
495
|
+
const synced = syncLocalBaseToUpstream(repoRoot, worktreePath, base, upstream);
|
|
496
|
+
if (!synced.ok) return head({ status: "review_requested", error: synced.error });
|
|
449
497
|
}
|
|
450
498
|
}
|
|
451
499
|
|
|
@@ -460,13 +508,16 @@ function mergeRebaseFf(
|
|
|
460
508
|
const behind = countBehind(worktreePath, base);
|
|
461
509
|
|
|
462
510
|
// Advance base. `baseTip` is base's new tip after the land: it equals headSha on a
|
|
463
|
-
// clean fast-forward, or the merge commit on a no-ff merge.
|
|
464
|
-
//
|
|
465
|
-
//
|
|
511
|
+
// clean fast-forward, or the merge commit on a no-ff merge. Operate IN the base worktree
|
|
512
|
+
// only when it exists AND is clean — that keeps its working tree consistent with the
|
|
513
|
+
// advanced ref (the pristine home-repo checkout). When the base worktree is dirty
|
|
514
|
+
// (#644: a human's WIP in the shared checkout) we must NOT refuse and stall every lane's
|
|
515
|
+
// land: fall through to ref-plumbing (update-ref / synthesized no-ff merge) below, which
|
|
516
|
+
// advances refs/heads/<base> without reading or touching that working tree. The human's
|
|
517
|
+
// uncommitted work is left exactly as-is — the ref moves underneath, their files untouched.
|
|
466
518
|
let baseTip = headSha;
|
|
467
519
|
const baseWorktree = worktreeForBranch(repoRoot, base);
|
|
468
|
-
if (baseWorktree) {
|
|
469
|
-
if (baseWorktree.dirty) return head({ status: "review_requested", error: `base branch '${base}' has uncommitted changes in ${baseWorktree.path}` });
|
|
520
|
+
if (baseWorktree && !baseWorktree.dirty) {
|
|
470
521
|
if (behind === 0) {
|
|
471
522
|
const ff = git(["merge", "--ff-only", branch], baseWorktree.path);
|
|
472
523
|
if (!ff.ok) return head({ status: "review_requested", error: ff.stderr || "fast-forward into base failed" });
|