agent-relay-server 0.11.6 → 0.11.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,22 +9,31 @@ import {
9
9
  createActivityEvent,
10
10
  evaluatePoolBindings,
11
11
  expireQueuedMessages,
12
+ getAgent,
12
13
  getDb,
14
+ getRepoSteward,
15
+ getWorkspace,
13
16
  listOrchestrators,
14
17
  listWorkspaces,
18
+ patchWorkspaceMetadata,
15
19
  pruneOfflineAgents,
16
20
  pruneOldMessages,
17
21
  deleteWorkspace,
18
22
  pruneOrphanedSharedWorkspaces,
19
23
  reapStaleAgents,
20
24
  reapStaleOrchestrators,
25
+ reelectRepoSteward,
21
26
  releaseExpiredClaims,
27
+ releaseExpiredMergeLeases,
22
28
  releaseOrphanedTasks,
23
29
  sendMessage,
24
30
  sweepArtifacts,
25
31
  updateWorkspaceStatus,
26
32
  } from "./db";
27
33
  import type { WorkspaceMergePreview, WorkspaceRecord, WorkspaceStatus } from "./types";
34
+ import { requestWorkspaceMerge } from "./workspace-merge";
35
+ import { getStewardConfig } from "./config-store";
36
+ import { ensureRepoSteward } from "./steward";
28
37
  import { emitRelayEvent } from "./events";
29
38
  import { getLifecycleManager } from "./lifecycle-manager";
30
39
  import { applyCommandToRecipe } from "./recipe-runner";
@@ -49,10 +58,58 @@ const CONFLICT_SCAN_INTERVAL_MS = Number(process.env.AGENT_RELAY_CONFLICT_SCAN_I
49
58
  const WORKSPACE_RETENTION_MS = Number(process.env.AGENT_RELAY_WORKSPACE_RETENTION_MS) || DAY_MS;
50
59
  const WORKSPACE_REVIEW_TTL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_REVIEW_TTL_MS) || 3 * DAY_MS;
51
60
  const WORKSPACE_GC_INTERVAL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_GC_INTERVAL_MS) || 60 * 60 * 1000;
61
+ // Deterministic auto-land (Layer 0): merge clean fast-forwards with no human in
62
+ // the loop. Default on for the seamless workflow; set AGENT_RELAY_WORKSPACE_AUTO_MERGE=0
63
+ // to require a manual or steward merge per repo. Read at call-time so operators can
64
+ // toggle it without a restart.
65
+ const WORKSPACE_AUTO_MERGE_INTERVAL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_AUTO_MERGE_INTERVAL_MS) || CONFLICT_SCAN_INTERVAL_MS;
66
+ // Don't re-wake the managed steward for the same workspace more than once per
67
+ // this window — a persistent conflict/behind row would otherwise re-ping every sweep.
68
+ const STEWARD_WAKE_COOLDOWN_MS = Number(process.env.AGENT_RELAY_STEWARD_WAKE_COOLDOWN_MS) || 10 * 60 * 1000;
69
+ // How long a stranded review_requested/conflict worktree (no online steward) may
70
+ // sit before escalating to the configured fallback target, and the durable
71
+ // escalation target itself (`policy:<name>`, `label:<name>`, `cap:<name>`, an
72
+ // agent id, or `broadcast`). Read at call-time so config changes take effect
73
+ // without a restart (issue #157).
74
+ const stewardEscalationMs = () => Number(process.env.AGENT_RELAY_WORKSPACE_STEWARD_ESCALATION_MS) || 60 * 60 * 1000;
75
+ const stewardFallbackTarget = () => (process.env.AGENT_RELAY_WORKSPACE_STEWARD_FALLBACK || "").trim();
76
+ // Statuses that need an owner — a stranded one of these is what escalation rescues.
77
+ const STRANDABLE_STATUSES = new Set<WorkspaceStatus>(["review_requested", "conflict"]);
52
78
  // Live statuses worth scanning. Terminal (cleaned/merged/abandoned) and
53
79
  // in-flight (cleanup_requested) states are skipped.
54
80
  const CONFLICT_SCAN_STATUSES = new Set<WorkspaceStatus>(["active", "ready", "review_requested", "merge_planned", "conflict"]);
55
81
  const TERMINAL_WORKSPACE_STATUSES = new Set<WorkspaceStatus>(["cleaned", "merged", "abandoned"]);
82
+ // In-flight merge statuses that should reconcile to `merged` once the host
83
+ // reports the branch's work has landed in base (squash/cherry-pick, or a merged
84
+ // PR). Excludes active/ready: an agent still working may have landed an early
85
+ // commit while more work is in flight — don't yank its workspace out from under it.
86
+ const LANDED_RECONCILE_STATUSES = new Set<WorkspaceStatus>(["merge_planned", "review_requested", "conflict"]);
87
+
88
+ // Orphaned-session reaper. A spawned agent's process can outlive its relay
89
+ // presence: the relay agent goes offline/pruned but the orchestrator still
90
+ // reports the session's process running, so it lingers forever (visible under the
91
+ // orchestrator, gone from the Agents panel). Runtime-token self-heal recovers the
92
+ // recoverable ones; this is the backstop that stops the genuinely stuck ones.
93
+ // Conservative by design — a session must be observed continuously orphaned by
94
+ // THIS relay for the grace window before it is reaped, and the tracker is in-memory
95
+ // so a relay restart restarts the clock (giving self-heal first crack every time).
96
+ const ORPHAN_REAPER_INTERVAL_MS = Number(process.env.AGENT_RELAY_ORPHAN_REAPER_INTERVAL_MS) || 5 * 60 * 1000;
97
+ // Read at call-time so changes take effect without a restart (and so tests can tune
98
+ // them). Parsed to allow an explicit 0 (immediate) — `|| default` would reject it.
99
/**
 * Read a millisecond duration from the environment at call time.
 *
 * @param name - Name of the environment variable to look up in `process.env`.
 * @param fallback - Value returned when the variable is unset, blank, or does
 *   not parse to a finite, non-negative number.
 * @returns The parsed number (an explicit `0` is honoured) or `fallback`.
 */
const envMsOrDefault = (name: string, fallback: number): number => {
  const raw = process.env[name]?.trim();
  // `Number("")` and `Number("   ")` evaluate to 0, so a blank variable has to
  // be rejected before parsing — otherwise it would read as an explicit 0.
  if (!raw) return fallback;
  const v = Number(raw);
  return Number.isFinite(v) && v >= 0 ? v : fallback;
};
103
+ const orphanGraceMs = () => envMsOrDefault("AGENT_RELAY_ORPHAN_GRACE_MS", 30 * 60 * 1000);
104
+ const orphanReapCooldownMs = () => envMsOrDefault("AGENT_RELAY_ORPHAN_REAP_COOLDOWN_MS", 5 * 60 * 1000);
105
+ // Set AGENT_RELAY_ORPHAN_REAP=0 to detect + log orphans but never stop them.
106
+ const orphanReapEnabled = () => process.env.AGENT_RELAY_ORPHAN_REAP !== "0";
107
+ // orchestratorId + session identity -> when we first saw it orphaned (and last reaped).
108
+ const orphanTracker = new Map<string, { firstOrphanedAt: number; lastReapAt?: number }>();
109
+
110
/**
 * Test hook: empties the in-memory orphan tracker so no previously recorded
 * first-orphaned / last-reaped timestamps carry over.
 */
export function resetOrphanTrackerForTests(): void {
  orphanTracker.clear();
}
56
113
 
57
114
  interface MaintenanceJobDefinition {
58
115
  id: string;
@@ -205,6 +262,14 @@ const definitions: MaintenanceJobDefinition[] = [
205
262
  return { prunedAgentIds };
206
263
  },
207
264
  },
265
+ {
266
+ id: "orphaned-session-reaper",
267
+ title: "Orphaned session reaper",
268
+ description: "Stop spawned sessions whose relay agent is offline/gone but whose process the orchestrator still reports running, after a grace period for self-heal.",
269
+ intervalMs: ORPHAN_REAPER_INTERVAL_MS,
270
+ runOnStart: false,
271
+ handler: reapOrphanedSessions,
272
+ },
208
273
  {
209
274
  id: "orchestrator-reaper",
210
275
  title: "Orchestrator reaper",
@@ -305,6 +370,15 @@ const definitions: MaintenanceJobDefinition[] = [
305
370
  timeoutMs: 60 * 1000,
306
371
  handler: scanWorkspaceConflicts,
307
372
  },
373
+ {
374
+ id: "workspace-auto-merge",
375
+ title: "Workspace auto-merge",
376
+ description: "Auto-merge clean fast-forward review_requested worktrees into base under the per-repo lease; conflicts and diverged bases are left for the steward.",
377
+ intervalMs: WORKSPACE_AUTO_MERGE_INTERVAL_MS,
378
+ runOnStart: false,
379
+ timeoutMs: 60 * 1000,
380
+ handler: autoMergeCleanFastForwards,
381
+ },
308
382
  {
309
383
  id: "workspace-gc",
310
384
  title: "Workspace GC",
@@ -323,7 +397,7 @@ function workspacePathWithinBase(path: string | undefined, baseDir: string | und
323
397
  }
324
398
 
325
399
  async function fetchHostMergePreview(apiUrl: string, workspace: WorkspaceRecord): Promise<WorkspaceMergePreview | { available: false } | null> {
326
- const query = new URLSearchParams({ path: workspace.worktreePath });
400
+ const query = new URLSearchParams({ path: workspace.worktreePath, checkPr: "1" });
327
401
  if (workspace.baseRef) query.set("baseRef", workspace.baseRef);
328
402
  if (workspace.baseSha) query.set("baseSha", workspace.baseSha);
329
403
  const headers: Record<string, string> = {};
@@ -342,6 +416,118 @@ async function fetchHostMergePreview(apiUrl: string, workspace: WorkspaceRecord)
342
416
  // cleanly. Auto-flag `conflict` when a clean merge is no longer possible, and
343
417
  // auto-clear conflicts we set ourselves once they resolve (restoring the prior
344
418
  // status). Human-set conflicts are never cleared.
419
+ // Stop orphaned spawned sessions: process reported alive by the orchestrator, but
420
+ // the relay agent is offline/pruned and self-heal has had its chance. See the
421
+ // ORPHAN_* notes above. Covers both policy-managed and dashboard/ad-hoc spawns by
422
+ // iterating the orchestrators' reported managedAgents directly.
423
/**
 * Maintenance handler: request shutdown of sessions that an online orchestrator
 * still lists under `managedAgents` but that have no live relay agent (the
 * agent record is missing or its status is "offline").
 *
 * A session is only reaped once it has been tracked as orphaned for at least
 * the grace window, reaping is enabled, and it is not inside the per-session
 * reap cooldown. Tracker entries for sessions not seen as orphaned in this pass
 * are dropped at the end.
 *
 * @returns `orphaned` (number of orphaned sessions seen this pass), `reaped`
 *   (tracker keys a shutdown was requested for), `tracked` (tracker size after
 *   the pass) and `reapEnabled`.
 */
function reapOrphanedSessions(): Record<string, unknown> {
  const now = Date.now();
  const graceMs = orphanGraceMs();
  const cooldownMs = orphanReapCooldownMs();
  const reapEnabled = orphanReapEnabled();
  const stillOrphaned = new Set<string>();
  const reaped: string[] = [];
  let orphaned = 0;

  for (const orch of listOrchestrators()) {
    // An orchestrator that is not online can neither be trusted nor receive the stop.
    if (orch.status !== "online") continue;

    for (const session of orch.managedAgents) {
      const sessionId = session.spawnRequestId || session.tmuxSession || session.sessionName || session.agentId;
      if (!sessionId) continue;

      const key = `${orch.id}:${sessionId}`;
      const relayAgent = session.agentId ? getAgent(session.agentId) : null;
      // Any status other than "offline" (including "stale") counts as alive.
      const hasLiveAgent = relayAgent ? relayAgent.status !== "offline" : false;
      if (hasLiveAgent) {
        orphanTracker.delete(key);
        continue;
      }

      stillOrphaned.add(key);
      orphaned += 1;

      let entry = orphanTracker.get(key);
      if (entry === undefined) {
        entry = { firstOrphanedAt: now };
        orphanTracker.set(key, entry);
      }

      const orphanAgeMs = now - entry.firstOrphanedAt;
      if (orphanAgeMs < graceMs) continue; // still inside the grace window
      if (!reapEnabled) continue; // detect-only mode
      const coolingDown = entry.lastReapAt ? now - entry.lastReapAt < cooldownMs : false;
      if (coolingDown) continue; // a shutdown was already requested recently
      entry.lastReapAt = now;

      const displayName = session.label ?? session.agentId ?? sessionId;
      const graceMinutes = Math.round(graceMs / 60000);

      const command = createCommand({
        type: "agent.shutdown",
        source: "system",
        target: orch.agentId,
        correlationId: session.spawnRequestId,
        params: {
          action: "shutdown",
          agentId: session.agentId,
          spawnRequestId: session.spawnRequestId,
          sessionName: session.sessionName,
          tmuxSession: session.tmuxSession,
          policyName: session.policyName,
          graceful: false,
          timeoutMs: 10_000,
          reason: "orphaned-session-reaper",
          requestedBy: "orphaned-session-reaper",
          requestedAt: now,
          orchestratorId: orch.id,
        },
      });
      emitRelayEvent({ type: "command.requested", source: "system", subject: command.id, data: { command } });

      createActivityEvent({
        clientId: `orphaned-session-reaper-${key}-${now}`,
        kind: "state",
        title: "Orphaned session reaped",
        body: `${displayName}: process still running on ${orch.id}, but its relay agent has been offline > ${graceMinutes}m and did not self-heal — stopping it`,
        meta: displayName,
        icon: "ti-ghost",
        view: "orchestrators",
        agentId: session.agentId || undefined,
        metadata: {
          source: "server",
          maintenanceJobId: "orphaned-session-reaper",
          orchestratorId: orch.id,
          agentId: session.agentId,
          spawnRequestId: session.spawnRequestId,
          tmuxSession: session.tmuxSession,
          commandId: command.id,
          orphanAgeMs,
        },
      });
      reaped.push(key);
    }
  }

  // Drop entries that were not orphaned this pass so a later orphaning of the
  // same session starts with a fresh grace window.
  orphanTracker.forEach((_entry, key) => {
    if (!stillOrphaned.has(key)) orphanTracker.delete(key);
  });

  return { orphaned, reaped, tracked: orphanTracker.size, reapEnabled };
}
501
+
502
+ // Wake the managed per-repo steward (issue #167) for a workspace it should handle:
503
+ // auto-provision the policy from global steward config, then queue a `policy:` wake
504
+ // message (which also spawns the on-demand agent now via onMessageForPolicy). Honors a
505
+ // per-workspace cooldown so a persistent conflict/behind row isn't re-pinged every sweep.
506
+ // Returns the steward policy name on a fresh wake, or null (disabled / no owner / cooled down).
507
/**
 * Queue a wake message for the per-repo steward policy of a workspace.
 *
 * Skips the wake when the workspace metadata records a `stewardWokenAt`
 * timestamp inside the cooldown window, or when `ensureRepoSteward` yields no
 * policy name. On success the wake time and policy name are written back to the
 * workspace metadata.
 *
 * @param ws - Workspace the steward should look at.
 * @param reason - Short explanation embedded in the message body and payload.
 * @returns The steward policy name when a message was sent, otherwise `null`
 *   (cooled down, no policy, or sending threw).
 */
function wakeRepoSteward(ws: WorkspaceRecord, reason: string): string | null {
  const { stewardWokenAt } = ws.metadata as Record<string, unknown>;
  const lastWoke = typeof stewardWokenAt === "number" ? stewardWokenAt : 0;
  if (lastWoke) {
    const sinceLastWake = Date.now() - lastWoke;
    if (sinceLastWake < STEWARD_WAKE_COOLDOWN_MS) return null;
  }

  const policyName = ensureRepoSteward(ws.repoRoot);
  if (!policyName) return null;

  try {
    const subject = `Steward: ${ws.status} workspace needs attention`;
    const body = `Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} is ${ws.status} and could not auto-land (${reason}). cd into ${ws.worktreePath}, rebase onto ${ws.baseRef ?? "base"}, resolve, run checks, then land it via POST /api/workspaces/${ws.id}/actions {"action":"merge","strategy":"rebase-ff"} — or escalate if you can't.`;
    const msg = sendMessage({
      from: "system",
      to: `policy:${policyName}`,
      kind: "system",
      subject,
      body,
      payload: {
        kind: "workspace.steward-task",
        workspaceId: ws.id,
        repoRoot: ws.repoRoot,
        worktreePath: ws.worktreePath,
        branch: ws.branch,
        baseRef: ws.baseRef,
        status: ws.status,
        reason,
      },
    });
    emitNewMessage(msg);
    getLifecycleManager().onMessageForPolicy(policyName);
    // Stamp the wake so later sweeps honour the cooldown.
    patchWorkspaceMetadata(ws.id, { stewardWokenAt: Date.now(), stewardPolicy: policyName });
    return policyName;
  } catch {
    // Best-effort: a failed wake is reported to the caller as "not woken".
    return null;
  }
}
530
+
345
531
  async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
346
532
  const orchestrators = listOrchestrators().filter((orch) => orch.status === "online" && orch.apiUrl);
347
533
  if (!orchestrators.length) return { scanned: 0, skipped: "no online orchestrators" };
@@ -351,6 +537,7 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
351
537
  );
352
538
  const flagged: string[] = [];
353
539
  const cleared: string[] = [];
540
+ const merged: string[] = [];
354
541
  const notifiedStewards: string[] = [];
355
542
 
356
543
  for (const ws of candidates) {
@@ -362,6 +549,37 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
362
549
  if (p.error || p.missing || p.conflict === undefined) continue;
363
550
 
364
551
  const meta = ws.metadata as Record<string, unknown>;
552
+
553
+ // Landing wins over everything else. Once the work is in base — whether the
554
+ // PR was squash/cherry-pick merged on GitHub or fast-forwarded locally — the
555
+ // workspace is done, even if `git merge-tree` still predicts a textual
556
+ // conflict against the now-moved base (a PR-strategy row sits at
557
+ // merge_planned forever otherwise, and the conflict scan can even pin a
558
+ // landed branch to `conflict`). Reconcile to the terminal `merged` status so
559
+ // the dashboard stops showing it as unmerged and GC prunes it on schedule.
560
+ const landed = p.landed === true || p.prMerged === true;
561
+ if (landed && LANDED_RECONCILE_STATUSES.has(ws.status)) {
562
+ updateWorkspaceStatus(ws.id, "merged", {
563
+ autoMerged: true,
564
+ mergedFromStatus: ws.status,
565
+ landedDetectedAt: Date.now(),
566
+ landedVia: p.prMerged === true ? "pr" : "git",
567
+ autoConflict: false,
568
+ });
569
+ merged.push(ws.id);
570
+ createActivityEvent({
571
+ clientId: "server-workspace-" + ws.id + "-merged-" + Date.now(),
572
+ kind: "state",
573
+ title: "Workspace work landed in base",
574
+ body: `${ws.branch ?? ws.id} is ${p.prMerged === true ? "merged on the remote (PR)" : "already merged into base"} ${p.baseRef ? `(${p.baseRef})` : ""} — marking merged`,
575
+ meta: ws.branch ?? ws.id,
576
+ icon: "ti-git-merge",
577
+ view: "orchestrators",
578
+ metadata: { source: "server", maintenanceJobId: "workspace-conflict-scan", workspaceId: ws.id, fromStatus: ws.status },
579
+ });
580
+ continue;
581
+ }
582
+
365
583
  if (p.conflict === true && ws.status !== "conflict") {
366
584
  updateWorkspaceStatus(ws.id, "conflict", {
367
585
  autoConflict: true,
@@ -382,10 +600,15 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
382
600
  view: "orchestrators",
383
601
  metadata: { source: "server", maintenanceJobId: "workspace-conflict-scan", workspaceId: ws.id, ahead: p.ahead, behind: p.behind },
384
602
  });
385
- // The steward is the repo's coordination point ping it so a conflict
386
- // gets resolved instead of silently rotting until merge time. Once-per-
387
- // onset (we only enter this branch on the active→conflict transition).
388
- if (ws.stewardAgentId) {
603
+ // Hand the conflict to a steward so it gets resolved instead of rotting
604
+ // until merge time. Once-per-onset (we only enter this branch on the
605
+ // active→conflict transition). When managed stewards are enabled, wake the
606
+ // auto-provisioned per-repo steward agent (#167); otherwise fall back to the
607
+ // legacy direct ping of the elected steward agent.
608
+ if (getStewardConfig().enabled) {
609
+ const woke = wakeRepoSteward(getWorkspace(ws.id) ?? ws, "conflict");
610
+ if (woke) notifiedStewards.push(woke);
611
+ } else if (ws.stewardAgentId) {
389
612
  try {
390
613
  const msg = sendMessage({
391
614
  from: "system",
@@ -410,7 +633,86 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
410
633
  }
411
634
  }
412
635
 
413
- return { scanned: candidates.length, flagged, cleared, notifiedStewards };
636
+ return { scanned: candidates.length, flagged, cleared, merged, notifiedStewards };
637
+ }
638
+
639
+ // Deterministic auto-land (Layer 0, issue #167). Walk the "ready to land" queue
640
+ // (`review_requested` isolated worktrees) and, for any whose work is a strict
641
+ // clean fast-forward (no conflict, base hasn't moved, real commits ahead), land
642
+ // it via the shared merge helper — the same lease-serialized path the merge route
643
+ // uses. Conflicts and diverged bases (`behind>0`, even if cleanly rebasable) are
644
+ // deliberately left for the steward (a human or, later, the managed steward
645
+ // agent): per the chosen "Clean FF immediate" gate, anything needing a rebase or
646
+ // conflict reasoning is not auto-landed. No agent in the loop for the easy case.
647
/**
 * Maintenance handler: request a merge for every isolated `review_requested`
 * workspace whose host merge preview is a strict clean fast-forward
 * (`cleanFastForward` true, no conflict, `behind` 0, at least one commit ahead).
 *
 * Everything else is recorded in `leftForSteward`; when the steward config is
 * enabled the per-repo steward is woken with a reason that matches what the
 * preview actually reported. Disabled entirely when
 * `AGENT_RELAY_WORKSPACE_AUTO_MERGE` is `"0"` (read on every call).
 *
 * @returns Either a `skipped` marker, or `scanned` plus the workspace ids that
 *   were `merged`, `heldByLease`, `leftForSteward`, and the steward policies in
 *   `wokeStewards`.
 */
async function autoMergeCleanFastForwards(): Promise<Record<string, unknown>> {
  if (process.env.AGENT_RELAY_WORKSPACE_AUTO_MERGE === "0") return { skipped: "disabled" };
  const orchestrators = listOrchestrators().filter((orch) => orch.status === "online" && orch.apiUrl);
  if (!orchestrators.length) return { scanned: 0, skipped: "no online orchestrators" };

  const candidates = listWorkspaces().filter(
    (ws) => ws.mode === "isolated" && Boolean(ws.worktreePath) && ws.status === "review_requested",
  );
  const stewardEnabled = getStewardConfig().enabled;
  const merged: string[] = [];
  const heldByLease: string[] = [];
  const leftForSteward: string[] = [];
  const wokeStewards: string[] = [];

  for (const ws of candidates) {
    // Pick the online orchestrator whose base directory contains the workspace's source cwd.
    const orch = orchestrators.find((candidate) => workspacePathWithinBase(ws.sourceCwd, candidate.baseDir));
    if (!orch?.apiUrl) continue;
    const preview = await fetchHostMergePreview(orch.apiUrl, ws);
    if (!preview || (preview as { available?: false }).available === false) continue;
    const p = preview as WorkspaceMergePreview;
    if (p.error || p.missing) continue;

    const ahead = p.unmergedAhead ?? p.ahead ?? 0;
    const behind = p.behind ?? 0;
    const cleanFF = p.cleanFastForward === true && p.conflict !== true && behind === 0 && ahead > 0;
    if (!cleanFF) {
      // Not auto-landable. Wake the managed steward when enabled (the wake is
      // cooldown-guarded); otherwise the workspace is simply left in place.
      leftForSteward.push(ws.id);
      if (stewardEnabled) {
        // Report the cause the preview actually shows. Previously every
        // non-behind case was labelled "conflict", including branches with
        // nothing ahead of base or with no clean-fast-forward verdict.
        const reason =
          behind > 0
            ? "base moved on (behind>0)"
            : p.conflict === true
              ? "conflict"
              : ahead <= 0
                ? "no commits ahead of base"
                : "not a clean fast-forward";
        const woke = wakeRepoSteward(ws, reason);
        if (woke) wokeStewards.push(woke);
      }
      continue;
    }

    const result = requestWorkspaceMerge(ws, { strategy: "rebase-ff", requestedBy: "auto-merge" });
    if (!result.ok) {
      // 409 = another merge holds this repo's lease this tick; retry next sweep.
      // NOTE(review): every non-ok result lands in heldByLease — confirm the
      // helper has no other refusal reasons worth reporting separately.
      heldByLease.push(ws.id);
      continue;
    }
    emitCommand(result.command);
    merged.push(ws.id);
    createActivityEvent({
      clientId: `workspace-auto-merge-${ws.id}-${Date.now()}`,
      kind: "state",
      title: "Workspace auto-merging (clean fast-forward)",
      body: `${ws.branch ?? ws.id} → ${p.baseRef ?? "base"} (${ahead} ahead, clean)`,
      meta: ws.branch ?? ws.id,
      icon: "ti-git-merge",
      view: "orchestrators",
      metadata: { source: "server", maintenanceJobId: "workspace-auto-merge", workspaceId: ws.id, commandId: result.command.id, ahead },
    });
  }

  return { scanned: candidates.length, merged, heldByLease, leftForSteward, wokeStewards };
}
705
+
706
+ // Send a system DM, swallowing failures (a stale/missing/misconfigured target
707
+ // must never break the GC sweep). Returns the target on success, null otherwise.
708
/**
 * Send a system message to `target` and emit it, treating any failure as
 * "not delivered" instead of propagating it.
 *
 * @param target - Message recipient; an empty string means nothing is sent.
 * @param subject - Message subject.
 * @param body - Message body.
 * @param payload - Structured payload attached to the message.
 * @returns `target` when the message was sent and emitted, otherwise `null`.
 */
function notifyTarget(target: string, subject: string, body: string, payload: Record<string, unknown>): string | null {
  let delivered: string | null = null;
  if (target) {
    try {
      const message = sendMessage({ from: "system", to: target, kind: "system", subject, body, payload });
      emitNewMessage(message);
      delivered = target;
    } catch {
      // Deliberately swallowed: a bad target must not abort the calling sweep.
      delivered = null;
    }
  }
  return delivered;
}
415
717
 
416
718
  async function workspaceGC(): Promise<Record<string, unknown>> {
@@ -418,6 +720,10 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
418
720
  const cutoff = now - WORKSPACE_RETENTION_MS;
419
721
  const reviewCutoff = now - WORKSPACE_REVIEW_TTL_MS;
420
722
 
723
+ // 0. Free any merge leases whose holder never reported back (orchestrator died
724
+ // mid-merge). The lease TTL is the safety net; this just reclaims them eagerly.
725
+ const releasedLeaseRepos = releaseExpiredMergeLeases(now);
726
+
421
727
  // 1. Prune terminal rows past retention
422
728
  const all = listWorkspaces();
423
729
  const terminalIds: string[] = [];
@@ -428,29 +734,84 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
428
734
  }
429
735
  }
430
736
 
431
- // 2. Auto-abandon stale review_requested worktrees
737
+ // 2. Rescue stranded review_requested/conflict worktrees (issue #157). A
738
+ // worktree is "stranded" when its steward is gone (all repo agents offline).
739
+ // Re-elect first — an agent may have rejoined — and hand off to the new
740
+ // steward; if none can be elected past the TTL, escalate to the fallback
741
+ // target so it never rots in silence. Bookkeeping uses patchWorkspaceMetadata
742
+ // (no updated_at bump) so the auto-abandon clock below keeps ticking.
743
+ const escalatedIds: string[] = [];
744
+ const reassignedIds: string[] = [];
745
+ const escalationTargets: string[] = [];
746
+ const escalationMs = stewardEscalationMs();
747
+ const fallbackTarget = stewardFallbackTarget();
748
+ for (const ws of all) {
749
+ if (!STRANDABLE_STATUSES.has(ws.status) || ws.mode !== "isolated" || !ws.worktreePath) continue;
750
+ reelectRepoSteward(ws.repoRoot);
751
+ const fresh = getWorkspace(ws.id);
752
+ if (!fresh || !STRANDABLE_STATUSES.has(fresh.status)) continue;
753
+ const meta = fresh.metadata as Record<string, unknown>;
754
+ const steward = fresh.stewardAgentId;
755
+ const stewardOnline = Boolean(steward && getAgent(steward) && getAgent(steward)!.status !== "offline");
756
+ const strandedAt = typeof meta.strandedAt === "number" ? meta.strandedAt : undefined;
757
+
758
+ if (stewardOnline) {
759
+ // An online steward owns it. If it was previously stranded and this
760
+ // steward hasn't been told, hand it off explicitly, then clear markers.
761
+ if (strandedAt !== undefined && meta.strandedNotifiedSteward !== steward) {
762
+ const sent = notifyTarget(
763
+ steward!,
764
+ "Workspace stewardship reassigned",
765
+ `You are now steward for ${fresh.repoRoot}. Workspace \`${fresh.branch ?? fresh.id}\` is ${fresh.status} and was stranded without an online steward — please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"}.`,
766
+ { kind: "workspace.steward-reassigned", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status },
767
+ );
768
+ if (sent) reassignedIds.push(fresh.id);
769
+ }
770
+ patchWorkspaceMetadata(fresh.id, { strandedAt: undefined, escalatedAt: undefined, strandedNotifiedSteward: steward });
771
+ continue;
772
+ }
773
+
774
+ // Stranded: no online steward could be elected.
775
+ if (strandedAt === undefined) { patchWorkspaceMetadata(fresh.id, { strandedAt: now }); continue; }
776
+ if (now - strandedAt < escalationMs || meta.escalatedAt) continue;
777
+ const sent = notifyTarget(
778
+ fallbackTarget,
779
+ "Stranded workspace needs an owner",
780
+ `Workspace \`${fresh.branch ?? fresh.id}\` in ${fresh.repoRoot} is ${fresh.status} with no online steward (all repo agents offline) for ${Math.round((now - strandedAt) / (60 * 60 * 1000))}h. Please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"} or clean up the worktree.`,
781
+ { kind: "workspace.stranded-escalation", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status, strandedAt },
782
+ );
783
+ if (sent) escalationTargets.push(sent);
784
+ patchWorkspaceMetadata(fresh.id, { escalatedAt: now });
785
+ escalatedIds.push(fresh.id);
786
+ createActivityEvent({
787
+ clientId: `workspace-gc-escalate-${fresh.id}-${now}`,
788
+ kind: "state",
789
+ title: "Workspace escalated",
790
+ body: `${fresh.branch ?? fresh.id} in ${fresh.repoRoot} — stranded ${fresh.status} escalated${fallbackTarget ? ` to ${fallbackTarget}` : " (no fallback configured)"}`,
791
+ meta: fresh.branch ?? fresh.id,
792
+ icon: "ti-alert-octagon",
793
+ view: "orchestrators",
794
+ metadata: { source: "server", maintenanceJobId: "workspace-gc", workspaceId: fresh.id, fallback: fallbackTarget || null },
795
+ });
796
+ }
797
+
798
+ // 3. Auto-abandon stale review_requested worktrees
432
799
  const abandonedIds: string[] = [];
433
800
  const notifiedStewards: string[] = [];
434
801
  for (const ws of all) {
435
802
  if (ws.status === "review_requested" && ws.updatedAt < reviewCutoff) {
436
803
  updateWorkspaceStatus(ws.id, "abandoned", { autoAbandoned: true, abandonedReason: "review_requested TTL exceeded", abandonedAt: now });
437
804
  abandonedIds.push(ws.id);
438
- if (ws.stewardAgentId) {
439
- try {
440
- const msg = sendMessage({
441
- from: "system",
442
- to: ws.stewardAgentId,
443
- kind: "system",
444
- subject: "Workspace auto-abandoned",
445
- body: `Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} was auto-abandoned after ${Math.round(WORKSPACE_REVIEW_TTL_MS / DAY_MS)}d without steward action. Run workspace cleanup to reclaim the worktree.`,
446
- payload: { kind: "workspace.auto-abandoned", workspaceId: ws.id, repoRoot: ws.repoRoot, branch: ws.branch },
447
- });
448
- emitNewMessage(msg);
449
- notifiedStewards.push(ws.stewardAgentId);
450
- } catch {
451
- // Steward gone — activity event is enough.
452
- }
453
- }
805
+ // Notify the steward if one exists, else the configured fallback so a
806
+ // stranded abandon isn't silent (issue #157).
807
+ const target = ws.stewardAgentId ?? fallbackTarget;
808
+ const sent = notifyTarget(
809
+ target,
810
+ "Workspace auto-abandoned",
811
+ `Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} was auto-abandoned after ${Math.round(WORKSPACE_REVIEW_TTL_MS / DAY_MS)}d without steward action. Run workspace cleanup to reclaim the worktree.`,
812
+ { kind: "workspace.auto-abandoned", workspaceId: ws.id, repoRoot: ws.repoRoot, branch: ws.branch },
813
+ );
814
+ if (sent) notifiedStewards.push(sent);
454
815
  createActivityEvent({
455
816
  clientId: `workspace-gc-abandon-${ws.id}-${now}`,
456
817
  kind: "state",
@@ -483,7 +844,16 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
483
844
  pruneCommands.push(command.id);
484
845
  }
485
846
 
486
- return { prunedTerminal: terminalIds, autoAbandoned: abandonedIds, notifiedStewards, pruneCommands };
847
+ return {
848
+ prunedTerminal: terminalIds,
849
+ autoAbandoned: abandonedIds,
850
+ notifiedStewards,
851
+ pruneCommands,
852
+ releasedLeaseRepos,
853
+ escalated: escalatedIds,
854
+ reassigned: reassignedIds,
855
+ escalationTargets,
856
+ };
487
857
  }
488
858
 
489
859
  let timer: Timer | null = null;