agent-relay-orchestrator 0.83.1 → 0.85.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/control.ts +14 -1
- package/src/spawn/supervisor.ts +37 -1
- package/src/workspace-probe/merge.ts +49 -6
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-relay-orchestrator",
|
|
3
|
-
"version": "0.83.1",
|
|
3
|
+
"version": "0.85.0",
|
|
4
4
|
"description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
"test": "bun test"
|
|
17
17
|
},
|
|
18
18
|
"dependencies": {
|
|
19
|
-
"agent-relay-sdk": "0.2.
|
|
19
|
+
"agent-relay-sdk": "0.2.65"
|
|
20
20
|
},
|
|
21
21
|
"devDependencies": {
|
|
22
22
|
"@types/bun": "latest",
|
package/src/control.ts
CHANGED
|
@@ -6,6 +6,17 @@ import { readLocalProviderConfigs } from "./provider-config-migration";
|
|
|
6
6
|
import { spawnAgent, stopSession, type SpawnOptions } from "./spawn";
|
|
7
7
|
import { cleanupWorkspace, discardRecoveryBranch, mergeWorkspace, pruneWorktrees, reconcileWorkspace, refreshWorkspaceDeps, workspacesRoot } from "./workspace-probe";
|
|
8
8
|
import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
|
|
9
|
+
import type { WorkspaceMergeResult } from "agent-relay-sdk";
|
|
10
|
+
|
|
11
|
+
// #638 — settle a `workspace.merge` command on whether it made progress, not on whether
|
|
12
|
+
// the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
|
|
13
|
+
// replay conflict, a push race) returns merged:false with an `error` set; every genuine
|
|
14
|
+
// land / recycle / noop-resolve / PR-opened leaves `error` undefined. Reporting `succeeded`
|
|
15
|
+
// for an unlanded branch is what let the auto-merge job believe it was progressing and
|
|
16
|
+
// busy-loop the repo's merge lease. Mirrors the pr-merge / pr-arm settle contract.
|
|
17
|
+
export function mergeCommandStatus(result: WorkspaceMergeResult): "succeeded" | "failed" {
|
|
18
|
+
return result.error ? "failed" : "succeeded";
|
|
19
|
+
}
|
|
9
20
|
|
|
10
21
|
interface ControlHandler {
|
|
11
22
|
handleCommand(command: RelayCommand): Promise<boolean>;
|
|
@@ -132,7 +143,9 @@ export function createControlHandler(
|
|
|
132
143
|
}
|
|
133
144
|
: undefined,
|
|
134
145
|
});
|
|
135
|
-
|
|
146
|
+
// #638 — settle `failed` (carrying the error) for a no-op merge instead of
|
|
147
|
+
// `succeeded`; see mergeCommandStatus.
|
|
148
|
+
await relay.updateCommand(command.id, mergeCommandStatus(result), result as unknown as Record<string, unknown>, result.error);
|
|
136
149
|
} else if (command.type === "workspace.pr-arm-auto-merge") {
|
|
137
150
|
const rawPrNumber = command.params.prNumber;
|
|
138
151
|
const result = armWorkspacePrAutoMerge({
|
package/src/spawn/supervisor.ts
CHANGED
|
@@ -178,6 +178,37 @@ function logFileDiagnostics(logFile: string): Pick<ManagedSessionExitDiagnostics
|
|
|
178
178
|
}
|
|
179
179
|
}
|
|
180
180
|
|
|
181
|
+
/** Map systemd ExecMain* / Result + a best-effort journal OOM probe into signal/exitCode/oom (#636). */
|
|
182
|
+
function terminationDiagnostics(systemd: ManagedSessionExitDiagnostics["systemd"], pid?: number): Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> {
|
|
183
|
+
const out: Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> = {};
|
|
184
|
+
if (systemd?.result === "oom-kill") {
|
|
185
|
+
out.oom = { source: "systemd", detail: "Result=oom-kill" };
|
|
186
|
+
}
|
|
187
|
+
if (systemd?.execMainCode && /killed|dumped/i.test(systemd.execMainCode) && systemd.execMainStatus) {
|
|
188
|
+
out.signal = /^\d+$/.test(systemd.execMainStatus) ? `signal ${systemd.execMainStatus}` : `SIG${systemd.execMainStatus.replace(/^SIG/i, "")}`;
|
|
189
|
+
} else if (systemd?.execMainCode === "exited" && systemd.execMainStatus && /^\d+$/.test(systemd.execMainStatus)) {
|
|
190
|
+
out.exitCode = Number(systemd.execMainStatus);
|
|
191
|
+
}
|
|
192
|
+
// Best-effort kernel OOM-killer probe by pid — wrapped so a missing/permission-denied journal
|
|
193
|
+
// never breaks diagnosis. Only consulted when systemd didn't already flag the OOM.
|
|
194
|
+
if (!out.oom && pid && process.platform === "linux") {
|
|
195
|
+
try {
|
|
196
|
+
const result = Bun.spawnSync(["journalctl", "--user", "-k", "--no-pager", "-n", "200", "--grep", `Killed process ${pid}|Out of memory`], {
|
|
197
|
+
stdin: "ignore",
|
|
198
|
+
stdout: "pipe",
|
|
199
|
+
stderr: "ignore",
|
|
200
|
+
});
|
|
201
|
+
const text = result.exitCode === 0 ? result.stdout.toString() : "";
|
|
202
|
+
if (new RegExp(`Killed process ${pid}\\b|Out of memory: Killed process ${pid}\\b`).test(text)) {
|
|
203
|
+
out.oom = { source: "journal", detail: `OOM-killer hit pid ${pid}` };
|
|
204
|
+
}
|
|
205
|
+
} catch {
|
|
206
|
+
// journal unavailable — leave OOM unset.
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
return out;
|
|
210
|
+
}
|
|
211
|
+
|
|
181
212
|
function describeSessionExit(record: SessionRecord, diagnostics: Omit<ManagedSessionExitDiagnostics, "lastError">): string {
|
|
182
213
|
if (record.provider === "claude") {
|
|
183
214
|
const modelUnavailable = extractClaudeModelUnavailableMessage((diagnostics.logTail ?? []).join("\n"));
|
|
@@ -211,6 +242,8 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
|
|
|
211
242
|
const terminalAvailable = tmuxHasSession(record.name, readRunnerInfo(record)?.tmuxSocket);
|
|
212
243
|
const log = logFileDiagnostics(record.logFile);
|
|
213
244
|
const runnerInfoPresent = record.runnerInfoFile ? existsSync(record.runnerInfoFile) : false;
|
|
245
|
+
const systemd = supervisor.type === "systemd" && supervisor.unit ? systemdUnitDiagnostics(supervisor.unit) : undefined;
|
|
246
|
+
const termination = terminationDiagnostics(systemd, record.pid ?? currentPid);
|
|
214
247
|
const unavailable = [
|
|
215
248
|
...(log.logUnavailable ? [`stdout/stderr log unavailable: ${log.logUnavailable}`] : []),
|
|
216
249
|
...(log.logEmpty ? ["stdout/stderr log empty"] : []),
|
|
@@ -243,7 +276,10 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
|
|
|
243
276
|
logTail: log.logTail,
|
|
244
277
|
runnerInfoFile: record.runnerInfoFile,
|
|
245
278
|
runnerInfoPresent,
|
|
246
|
-
...(
|
|
279
|
+
...(systemd ? { systemd } : {}),
|
|
280
|
+
...(termination.signal ? { signal: termination.signal } : {}),
|
|
281
|
+
...(termination.exitCode !== undefined ? { exitCode: termination.exitCode } : {}),
|
|
282
|
+
...(termination.oom ? { oom: termination.oom } : {}),
|
|
247
283
|
...(unavailable.length ? { unavailable } : {}),
|
|
248
284
|
};
|
|
249
285
|
return {
|
package/src/workspace-probe/merge.ts
CHANGED
|
@@ -421,6 +421,34 @@ function recordNoFfMerge(
|
|
|
421
421
|
return { ok: true, mergeSha };
|
|
422
422
|
}
|
|
423
423
|
|
|
424
|
+
/**
|
|
425
|
+
* Fast-forward the local `base` ref to its fetched `upstream` tip (#638 concurrent-lane
|
|
426
|
+
* recovery). The caller has verified base is a strict ancestor of upstream — a clean ff,
|
|
427
|
+
* no divergence, nothing to lose. When base is checked out in a worktree, ff it there so
|
|
428
|
+
* that working tree stays consistent (refuse if it's dirty — can't ff cleanly); otherwise
|
|
429
|
+
* advance the ref directly. Returns an error string only when the sync genuinely can't be
|
|
430
|
+
* performed, so the merge path can surface it as a no-progress failure.
|
|
431
|
+
*/
|
|
432
|
+
function syncLocalBaseToUpstream(
|
|
433
|
+
repoRoot: string,
|
|
434
|
+
worktreePath: string,
|
|
435
|
+
base: string,
|
|
436
|
+
upstream: string,
|
|
437
|
+
): { ok: true } | { ok: false; error: string } {
|
|
438
|
+
const upstreamSha = git(["rev-parse", "--verify", upstream], worktreePath).stdout;
|
|
439
|
+
if (!upstreamSha) return { ok: false, error: `cannot resolve ${upstream} to sync ${base}` };
|
|
440
|
+
const baseWorktree = worktreeForBranch(repoRoot, base);
|
|
441
|
+
if (baseWorktree) {
|
|
442
|
+
if (baseWorktree.dirty) return { ok: false, error: `base branch '${base}' has uncommitted changes in ${baseWorktree.path}; cannot sync to ${upstream}` };
|
|
443
|
+
const ff = git(["merge", "--ff-only", upstream], baseWorktree.path);
|
|
444
|
+
if (!ff.ok) return { ok: false, error: ff.stderr || `failed to fast-forward ${base} to ${upstream}` };
|
|
445
|
+
return { ok: true };
|
|
446
|
+
}
|
|
447
|
+
const update = git(["update-ref", `refs/heads/${base}`, upstreamSha], repoRoot);
|
|
448
|
+
if (!update.ok) return { ok: false, error: update.stderr || `failed to advance ${base} to ${upstream}` };
|
|
449
|
+
return { ok: true };
|
|
450
|
+
}
|
|
451
|
+
|
|
424
452
|
function mergeRebaseFf(
|
|
425
453
|
input: WorkspaceMergeInput,
|
|
426
454
|
worktreePath: string,
|
|
@@ -433,11 +461,20 @@ function mergeRebaseFf(
|
|
|
433
461
|
if (!base) return head({ status: "review_requested", error: "no base branch to merge into" });
|
|
434
462
|
if (!branch) return head({ status: "review_requested", error: "cannot determine agent branch" });
|
|
435
463
|
|
|
436
|
-
// Reconcile with origin before landing (#190/#203). When base tracks an
|
|
437
|
-
// upstream (e.g. main -> origin/main) and we'll push, fetch it and
|
|
438
|
-
// origin has moved ahead of local base
|
|
439
|
-
//
|
|
440
|
-
//
|
|
464
|
+
// Reconcile with origin before landing (#190/#203/#638). When base tracks an
|
|
465
|
+
// upstream (e.g. main -> origin/main) and we'll push, fetch it and check whether
|
|
466
|
+
// origin has moved ahead of local base.
|
|
467
|
+
//
|
|
468
|
+
// Origin-ahead is the COMMON case under concurrency (#638): a sibling lane lands
|
|
469
|
+
// and advances origin/<base> while this lane still sits on a stale local base.
|
|
470
|
+
// Refusing here floods the steward on every normal multi-lane batch. Instead:
|
|
471
|
+
// - If local base carries nothing origin lacks (it's a strict ancestor of the
|
|
472
|
+
// fetched upstream), fast-forward local base to the upstream and land onto the
|
|
473
|
+
// FRESH origin (the no-ff merge below ties the branch onto the advanced base).
|
|
474
|
+
// This is a clean recovery — no steward escalation.
|
|
475
|
+
// - Refuse ONLY on genuine divergence: local base has commits not on origin, so
|
|
476
|
+
// a sync would rewrite/discard published-or-local history. A real failure must
|
|
477
|
+
// still set `error` so the relay's no-progress backoff (#638) can engage.
|
|
441
478
|
const upstream = upstreamRef(worktreePath, base);
|
|
442
479
|
const slash = upstream ? upstream.indexOf("/") : -1;
|
|
443
480
|
const remote = slash > 0 ? upstream!.slice(0, slash) : undefined; // remote of a `remote/branch` upstream
|
|
@@ -445,7 +482,13 @@ function mergeRebaseFf(
|
|
|
445
482
|
if (upstream && remote && pushEnabled) {
|
|
446
483
|
git(["fetch", remote, base], worktreePath); // best-effort freshness; a stale ref can only under-detect divergence
|
|
447
484
|
if (!git(["merge-base", "--is-ancestor", upstream, base], worktreePath).ok) {
|
|
448
|
-
|
|
485
|
+
// Origin moved ahead. Sync-then-land iff local base is cleanly behind (ancestor
|
|
486
|
+
// of upstream); otherwise it's genuine divergence — refuse without mutating.
|
|
487
|
+
if (!git(["merge-base", "--is-ancestor", base, upstream], worktreePath).ok) {
|
|
488
|
+
return head({ status: "review_requested", error: `local ${base} has diverged from ${upstream} (commits not on origin); sync before landing` });
|
|
489
|
+
}
|
|
490
|
+
const synced = syncLocalBaseToUpstream(repoRoot, worktreePath, base, upstream);
|
|
491
|
+
if (!synced.ok) return head({ status: "review_requested", error: synced.error });
|
|
449
492
|
}
|
|
450
493
|
}
|
|
451
494
|
|