agent-relay-server 0.27.1 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -211,9 +211,19 @@ function isPersistedRelayMessage(message: Message): boolean {
211
211
  return Number.isSafeInteger(message.id) && message.id > 0;
212
212
  }
213
213
 
214
+ // #283 — one-line nudge that replaces the reply-scaffold footer for notification-class
215
+ // (replyExpected:false) messages. Deliberately tiny so a bloated context can't drown the
216
+ // no-reply rule established at session start. Shared with the Claude delivery path.
217
+ export const NOTIFICATION_NUDGE = "↪ Notification — no reply needed.";
218
+
219
+ // A notification is a persisted message the server marked replyExpected:false.
220
+ export function isNotificationMessage(message: Message): boolean {
221
+ return isPersistedRelayMessage(message) && message.replyExpected === false;
222
+ }
223
+
214
224
  function latestReplyableMessage(messages: Message[]): Message | undefined {
215
225
  return messages
216
- .filter((message) => isPersistedRelayMessage(message) && !isMemoryInjection(message) && !isReactionNotification(message))
226
+ .filter((message) => isPersistedRelayMessage(message) && !isMemoryInjection(message) && !isReactionNotification(message) && message.replyExpected !== false)
217
227
  .at(-1);
218
228
  }
219
229
 
@@ -316,6 +326,9 @@ export function providerMessageText(messages: Message[]): string {
316
326
  "If you already delivered the useful response through Relay, do not send a separate status-only confirmation.",
317
327
  "If multiple messages arrived together, cover them in one reply instead of answering each line separately.",
318
328
  ].join("\n"));
329
+ } else if (messages.some(isNotificationMessage)) {
330
+ // #283 — pure notification batch: no scaffold, just the one-line no-reply nudge.
331
+ sections.push(NOTIFICATION_NUDGE);
319
332
  }
320
333
  return sections.join("\n\n");
321
334
  }
@@ -78,6 +78,7 @@ export function notifyBranchLanded(input: BranchLandedInput): void {
78
78
  subject: "Your branch landed",
79
79
  body: `✅ ${branchLabel} landed on \`${base}\`${shaLabel}${subjectLabel}.${continueLabel}`,
80
80
  payload,
81
+ replyExpected: false,
81
82
  });
82
83
  }
83
84
 
@@ -93,6 +94,7 @@ export function notifyBranchLanded(input: BranchLandedInput): void {
93
94
  subject: `Merged to ${base}`,
94
95
  body: `🔀 ${branchLabel}${authorLabel} merged to \`${base}\`${shaLabel}${subjectLabel}.`,
95
96
  payload,
97
+ replyExpected: false,
96
98
  });
97
99
  }
98
100
  }
package/src/db.ts CHANGED
@@ -226,6 +226,7 @@ export function initDb(path: string = "agent-relay.db"): Database {
226
226
  body TEXT NOT NULL,
227
227
  thread_id INTEGER,
228
228
  reply_to INTEGER REFERENCES messages(id),
229
+ reply_expected INTEGER NOT NULL DEFAULT 1,
229
230
  claimable INTEGER NOT NULL DEFAULT 0,
230
231
  claimed_by TEXT,
231
232
  claimed_at INTEGER,
@@ -857,6 +858,9 @@ export function initDb(path: string = "agent-relay.db"): Database {
857
858
  db.run("ALTER TABLE messages ADD COLUMN thread_id INTEGER");
858
859
  db.run("ALTER TABLE messages ADD COLUMN reply_to INTEGER REFERENCES messages(id)");
859
860
  }
861
+ if (!colNames.includes("reply_expected")) {
862
+ db.run("ALTER TABLE messages ADD COLUMN reply_expected INTEGER NOT NULL DEFAULT 1");
863
+ }
860
864
  if (!colNames.includes("claimable")) {
861
865
  db.run("ALTER TABLE messages ADD COLUMN claimable INTEGER NOT NULL DEFAULT 0");
862
866
  db.run("ALTER TABLE messages ADD COLUMN claimed_by TEXT");
@@ -1292,6 +1296,9 @@ function rowToMessage(row: any): Message {
1292
1296
  body: row.body,
1293
1297
  threadId: row.thread_id ?? undefined,
1294
1298
  replyTo: row.reply_to ?? undefined,
1299
+ // Default (true) stays absent to match the `claimable` idiom and keep notification-free
1300
+ // messages byte-identical on the wire; only an explicit notification surfaces false (#283).
1301
+ replyExpected: row.reply_expected === 0 ? false : undefined,
1295
1302
  claimable: row.claimable === 1 ? true : undefined,
1296
1303
  claimedBy: row.claimed_by ?? undefined,
1297
1304
  claimedAt: row.claimed_at ?? undefined,
@@ -3794,12 +3801,12 @@ export function sendMessageWithResult(input: SendMessageInput): { message: Messa
3794
3801
 
3795
3802
  const insert = db.query(`
3796
3803
  INSERT INTO messages (
3797
- from_agent, to_target, kind, channel, subject, body, thread_id, reply_to, claimable,
3804
+ from_agent, to_target, kind, channel, subject, body, thread_id, reply_to, reply_expected, claimable,
3798
3805
  idempotency_key, delivery_status, queued_at, max_age_seconds, resolved_to_agent,
3799
3806
  payload, meta, created_at, occurred_at
3800
3807
  )
3801
3808
  VALUES (
3802
- $from, $to, $kind, $channel, $subject, $body, $threadId, $replyTo, $claimable,
3809
+ $from, $to, $kind, $channel, $subject, $body, $threadId, $replyTo, $replyExpected, $claimable,
3803
3810
  $idempotencyKey, $deliveryStatus, $queuedAt, $maxAgeSeconds, $resolvedToAgent,
3804
3811
  $payload, $meta, $now, $occurredAt
3805
3812
  )
@@ -3833,6 +3840,9 @@ export function sendMessageWithResult(input: SendMessageInput): { message: Messa
3833
3840
  $body: input.body,
3834
3841
  $threadId: threadId,
3835
3842
  $replyTo: input.replyTo ?? null,
3843
+ // Server-owned reply obligation (#283): true by default; only an explicit false marks
3844
+ // a notification. Stored 0/1 so the footer renderer + reply tracker key off one column.
3845
+ $replyExpected: input.replyExpected === false ? 0 : 1,
3836
3846
  $claimable: claimable ? 1 : 0,
3837
3847
  $idempotencyKey: input.idempotencyKey ?? null,
3838
3848
  $deliveryStatus: deliveryStatus,
@@ -4318,6 +4328,9 @@ export function pollMessages(query: PollQuery): Message[] {
4318
4328
  }
4319
4329
 
4320
4330
  function messageRequiresReply(message: Message): boolean {
4331
+ // Server-owned notification flag (#283) wins over every kind/sender heuristic below: an
4332
+ // explicit replyExpected:false is a fire-and-forget message that must never become an obligation.
4333
+ if (message.replyExpected === false) return false;
4321
4334
  if (message.kind === "system" || message.kind === "control" || message.kind === "session") return false;
4322
4335
  if (message.from === "user") return true;
4323
4336
  if (message.kind === "task" || message.kind === "channel.event") return true;
package/src/mcp.ts CHANGED
@@ -247,19 +247,19 @@ const TOOLS: ToolDefinition[] = [
247
247
  },
248
248
  {
249
249
  name: "relay_spawn_agent",
250
- description: "Spawn a long-living provider agent through Relay's orchestrator. Gated: requires the command:spawn scope, granted only to agents whose profile sets maxSpawnedAgents>0, up to that live-children quota. Spawned agents cannot themselves spawn (no grandchildren).",
250
+ description: "Spawn a long-living provider agent through Relay's orchestrator, optionally handing it its first task via `prompt` in the same call. Defaults to your own host (override with orchestratorId) and returns the resolved agent id once it registers. Gated: requires the command:spawn scope, granted only to agents whose profile sets maxSpawnedAgents>0, up to that live-children quota. Spawned agents cannot themselves spawn (no grandchildren).",
251
251
  requiredScopes: ["command:spawn"],
252
252
  inputSchema: {
253
253
  type: "object",
254
254
  properties: {
255
255
  provider: { type: "string", enum: SPAWN_PROVIDERS },
256
- orchestratorId: { type: "string" },
257
- cwd: { type: "string" },
256
+ orchestratorId: { type: "string", description: "Target host. Defaults to the host that owns cwd, else YOUR OWN host — only set it to spawn onto a different machine." },
257
+ cwd: { type: "string", description: "Working directory for the agent. Must resolve within the target orchestrator's base directory (enforced server-side)." },
258
258
  label: { type: "string" },
259
259
  model: { type: "string" },
260
260
  effort: { type: "string", enum: VALID_EFFORTS },
261
261
  approvalMode: { type: "string", enum: APPROVAL_MODES },
262
- prompt: { type: "string" },
262
+ prompt: { type: "string", description: "Initial task/message delivered to the agent on launch — spawn and hand it its first instruction in one call (no separate follow-up message needed)." },
263
263
  systemPromptAppend: { type: "string" },
264
264
  profile: { type: "string", description: "Agent profile name to apply (env, instructions, permissions, MCP/skills, spawn quota)." },
265
265
  tags: { type: "array", items: { type: "string" } },
@@ -267,6 +267,7 @@ const TOOLS: ToolDefinition[] = [
267
267
  providerArgs: { type: "array", items: { type: "string" } },
268
268
  policyName: { type: "string" },
269
269
  spawnRequestId: { type: "string" },
270
+ waitForRegistrationMs: { type: "integer", minimum: 0, maximum: 30000, description: "How long to wait for the spawned agent to register before returning, so the response carries its resolved agent id (default 8000; 0 = return immediately with just spawnRequestId)." },
270
271
  },
271
272
  required: ["provider"],
272
273
  additionalProperties: false,
@@ -485,7 +486,7 @@ async function callTool(auth: McpAuthContext, params: unknown): Promise<Record<s
485
486
  else if (name === "relay_agent_status") result = relayAgentStatus(args);
486
487
  else if (name === "relay_find_agents") result = relayFindAgents(auth, args);
487
488
  else if (name === "relay_whoami") result = relayWhoami(auth);
488
- else if (name === "relay_spawn_agent") result = relaySpawnAgent(auth, args);
489
+ else if (name === "relay_spawn_agent") result = await relaySpawnAgent(auth, args);
489
490
  else if (name === "relay_shutdown_agent") result = relayShutdownAgent(auth, args);
490
491
  else if (name === "relay_workspace_status") result = await relayWorkspaceStatus(auth, args);
491
492
  else if (name === "relay_workspace_list") result = relayWorkspaceList(auth, args);
@@ -763,10 +764,12 @@ function relayFindAgents(auth: McpAuthContext, args: Record<string, unknown>): R
763
764
  return { agents, count: agents.length };
764
765
  }
765
766
 
766
- function relaySpawnAgent(auth: McpAuthContext, args: Record<string, unknown>): Record<string, unknown> {
767
+ async function relaySpawnAgent(auth: McpAuthContext, args: Record<string, unknown>): Promise<Record<string, unknown>> {
767
768
  const provider = enumField(args.provider, "provider", SPAWN_PROVIDERS) as SpawnProvider;
768
769
  const cwd = optionalString(args.cwd, "cwd", 500);
769
- const orchestrator = selectSpawnOrchestrator(provider, optionalString(args.orchestratorId, "orchestratorId", 200), cwd);
770
+ const callerId = callerAgentId(auth);
771
+ const preferHost = callerId ? getAgent(callerId)?.machine : undefined;
772
+ const orchestrator = selectSpawnOrchestrator(provider, optionalString(args.orchestratorId, "orchestratorId", 200), cwd, preferHost);
770
773
  const resolvedCwd = cwd || orchestrator.baseDir;
771
774
  if (cwd && !isPathWithinBase(cwd, orchestrator.baseDir)) {
772
775
  throw new ValidationError(`cwd must be within orchestrator base directory: ${orchestrator.baseDir}`);
@@ -781,7 +784,6 @@ function relaySpawnAgent(auth: McpAuthContext, args: Record<string, unknown>): R
781
784
  // #221 runtime gate (belt; the coarse `command:spawn` scope is enforced in callTool, and is
782
785
  // granted only to agents whose profile sets maxSpawnedAgents>0 and never to children).
783
786
  // Server/admin tokens have no caller identity → unrestricted by design.
784
- const callerId = callerAgentId(auth);
785
787
  if (callerId) {
786
788
  const me = getAgent(callerId);
787
789
  if (me?.spawnedBy) {
@@ -841,7 +843,27 @@ function relaySpawnAgent(auth: McpAuthContext, args: Record<string, unknown>): R
841
843
  }),
842
844
  });
843
845
  emitCommand(command);
844
- return { ok: true, orchestratorId: orchestrator.id, provider, command };
846
+
847
+ // #255: resolve the spawned agent id once it registers. Spawn is a fire-and-forget command
848
+ // over the bus; the child registers back to THIS relay (same DB) with meta.spawnRequestId set,
849
+ // so a bounded poll links the request to the agent without a separate relay_find_agents round
850
+ // trip. waitForRegistrationMs:0 opts out (pure fire-and-forget); the default is short because
851
+ // isolated-worktree spawns register near-instantly (symlinked deps).
852
+ const waitMs = Math.min(optionalNonNegativeInt(args.waitForRegistrationMs, "waitForRegistrationMs") ?? 8000, 30000);
853
+ const agentId = waitMs > 0 ? await waitForSpawnedAgent(spawnRequestId, waitMs) : null;
854
+ return { ok: true, spawnRequestId, orchestratorId: orchestrator.id, provider, agentId, registered: agentId !== null, command };
855
+ }
856
+
857
+ // Poll the agents table for the child that registers with this spawnRequestId (#255). Returns
858
+ // the resolved agent id, or null on timeout (the caller still has spawnRequestId to poll later).
859
+ async function waitForSpawnedAgent(spawnRequestId: string, timeoutMs: number, pollMs = 300): Promise<string | null> {
860
+ const deadline = Date.now() + timeoutMs;
861
+ for (;;) {
862
+ const match = listAgents().find((a) => a.meta?.spawnRequestId === spawnRequestId);
863
+ if (match) return match.id;
864
+ if (Date.now() >= deadline) return null;
865
+ await new Promise<void>((resolve) => setTimeout(resolve, Math.min(pollMs, Math.max(0, deadline - Date.now()))));
866
+ }
845
867
  }
846
868
 
847
869
  function relayShutdownAgent(auth: McpAuthContext, args: Record<string, unknown>): Record<string, unknown> {
@@ -1062,7 +1084,12 @@ function policyStatusPayload(policy: NonNullable<ReturnType<typeof getSpawnPolic
1062
1084
  };
1063
1085
  }
1064
1086
 
1065
- function selectSpawnOrchestrator(provider: SpawnProvider, orchestratorId?: string, cwd?: string): NonNullable<ReturnType<typeof getOrchestrator>> {
1087
+ function selectSpawnOrchestrator(
1088
+ provider: SpawnProvider,
1089
+ orchestratorId?: string,
1090
+ cwd?: string,
1091
+ preferHost?: string,
1092
+ ): NonNullable<ReturnType<typeof getOrchestrator>> {
1066
1093
  if (orchestratorId) {
1067
1094
  const orchestrator = getOrchestrator(orchestratorId);
1068
1095
  if (!orchestrator) throw new McpNotFoundError(`orchestrator ${orchestratorId} not found`);
@@ -1075,6 +1102,14 @@ function selectSpawnOrchestrator(provider: SpawnProvider, orchestratorId?: strin
1075
1102
  const match = candidates.find((item) => isPathWithinBase(cwd, item.baseDir));
1076
1103
  if (match) return match;
1077
1104
  }
1105
+ // #255: with neither an explicit id nor a cwd to pin the host, default to the CALLER's own
1106
+ // host instead of silently grabbing candidates[0] (a foreign host whose baseDir would then
1107
+ // reject the caller's cwd — the footgun the spawn recipe warned about). An agent's `machine`
1108
+ // is its OS hostname; match it against the orchestrator hostname (or id, defensively).
1109
+ if (preferHost) {
1110
+ const own = candidates.find((item) => item.hostname === preferHost || item.id === preferHost);
1111
+ if (own) return own;
1112
+ }
1078
1113
  const orchestrator = candidates[0];
1079
1114
  if (!orchestrator) throw new McpNotFoundError(`no orchestrator available for provider: ${provider}`);
1080
1115
  return orchestrator;
@@ -1327,6 +1362,12 @@ function optionalPositiveInt(value: unknown, field: string): number | undefined
1327
1362
  return value;
1328
1363
  }
1329
1364
 
1365
+ function optionalNonNegativeInt(value: unknown, field: string): number | undefined {
1366
+ if (value === undefined || value === null) return undefined;
1367
+ if (typeof value !== "number" || !Number.isSafeInteger(value) || value < 0) throw new ValidationError(`${field} must be a non-negative integer`);
1368
+ return value;
1369
+ }
1370
+
1330
1371
  function optionalFutureTimestamp(value: unknown, field: string): number | undefined {
1331
1372
  const timestamp = optionalPositiveInt(value, field);
1332
1373
  if (timestamp !== undefined && timestamp <= Date.now()) throw new ValidationError(`${field} must be a future unix timestamp in milliseconds`);
package/src/notify.ts CHANGED
@@ -10,6 +10,13 @@ export interface SystemNotifyOptions {
10
10
  kind?: MessageKind;
11
11
  /** Sender id; defaults to "system". */
12
12
  from?: string;
13
+ /**
14
+ * #283 — set false for a fire-and-forget notification (merge notice, lifecycle event): the
15
+ * server suppresses the reply-scaffold footer and the reply-obligation tracker skips it.
16
+ * Omit (default true) for system messages that genuinely want the agent to act/answer
17
+ * (steward task assignments, conflict handoffs).
18
+ */
19
+ replyExpected?: boolean;
13
20
  }
14
21
 
15
22
  /**
@@ -25,6 +32,7 @@ export function notifySystemMessage(to: string, opts: SystemNotifyOptions): Mess
25
32
  subject: opts.subject,
26
33
  body: opts.body,
27
34
  payload: opts.payload,
35
+ replyExpected: opts.replyExpected,
28
36
  });
29
37
  emitNewMessage(msg);
30
38
  return msg;
package/src/routes.ts CHANGED
@@ -520,6 +520,9 @@ function normalizeMessageInput(body: unknown): SendMessageInput {
520
520
  if (body.claimable !== undefined && typeof body.claimable !== "boolean") {
521
521
  throw new ValidationError("claimable must be a boolean");
522
522
  }
523
+ if (body.replyExpected !== undefined && typeof body.replyExpected !== "boolean") {
524
+ throw new ValidationError("replyExpected must be a boolean");
525
+ }
523
526
 
524
527
  const input: SendMessageInput = {
525
528
  from: cleanString(body.from, "from", { required: true, max: 200 })!,
@@ -527,6 +530,7 @@ function normalizeMessageInput(body: unknown): SendMessageInput {
527
530
  body: cleanString(body.body, "body", { required: true, max: MAX_BODY_BYTES })!,
528
531
  kind: kind as SendMessageInput["kind"] | undefined,
529
532
  replyTo: cleanPositiveId(body.replyTo, "replyTo"),
533
+ replyExpected: body.replyExpected as boolean | undefined,
530
534
  claimable: body.claimable as boolean | undefined,
531
535
  idempotencyKey: cleanString(body.idempotencyKey, "idempotencyKey", { max: 240 }),
532
536
  };
@@ -5,11 +5,15 @@
5
5
  // visible to the agent or the dashboard, so unlanded work can sit stranded for
6
6
  // weeks (one real casualty: a CI-guard test, recovered by hand).
7
7
  //
8
- // THE invariant (single home, see `worktreeReapable`): reaping is gated on
9
- // "nothing would be lost" — landed or empty — NEVER on session liveness or a
10
- // timer. A worktree holding un-landed commits is flagged for attention, never
11
- // force-removed. This module is the disk⇄DB reconciler the GC's `git worktree
12
- // prune` (a no-op while the directory exists) never was.
8
+ // THE invariant (single home, see `worktreeReapable`): a worktree is reaped only
9
+ // when BOTH hold — "nothing would be lost" (landed or empty; un-landed commits are
10
+ // flagged, never force-removed) AND "nobody is using it" (the owner is dead, and no
11
+ // live row claims the path). #278 proved the first half alone is not enough: a
12
+ // post-land recycled worktree (ahead 0) owned by a LIVE session is reap-safe by
13
+ // land-state, yet destroying it kills that session's toolchain mid-flight. Path-keyed
14
+ // row matching (never repoRoot-scoped) plus an owner-liveness guard enforce the
15
+ // second half. This module is the disk⇄DB reconciler the GC's `git worktree prune`
16
+ // (a no-op while the directory exists) never was.
13
17
 
14
18
  import { resolve } from "node:path";
15
19
  import { RELAY_TOKEN_HEADER } from "agent-relay-sdk";
@@ -20,6 +24,7 @@ import { emitRelayEvent } from "./events";
20
24
  import { isPathWithinBase } from "./utils";
21
25
  import { TERMINAL_WORKSPACE_STATUSES, worktreeReapable, type WorktreeReapState } from "./workspace-phase";
22
26
  import { isOwnerAlive } from "./workspace-merge";
27
+ import { applyWorkspaceAction } from "./workspace-actions";
23
28
 
24
29
  // Don't re-flag the same un-landed orphan every sweep — surface it once, then
25
30
  // stay quiet for this window. In-memory (keyed by worktree path) like the
@@ -29,10 +34,21 @@ const UNLANDED_FLAG_COOLDOWN_MS = Number(process.env.AGENT_RELAY_ORPHAN_FLAG_COO
29
34
  // remove them (parity with the session reaper's detect-only switch).
30
35
  const orphanWorktreeReapEnabled = () => process.env.AGENT_RELAY_ORPHAN_WORKTREE_REAP !== "0";
31
36
  const flaggedAt = new Map<string, number>();
32
- const IN_FLIGHT_MISSING_WORKTREE_STATUSES = new Set<WorkspaceStatus>(["merge_planned", "cleanup_requested"]);
37
+ const IN_FLIGHT_WORKSPACE_STATUSES = new Set<WorkspaceStatus>(["merge_planned", "cleanup_requested"]);
38
+
39
+ // #279 dead-owner grace window: a tracked worktree whose owner has died is only
40
+ // reaped after the owner is observed dead continuously for this window, so a
41
+ // reconnecting agent isn't raced. Keyed by resolved worktree path; in-memory like
42
+ // the session reaper's tracker (a restart re-arms the clock — conservative).
43
/**
 * Dead-owner grace window in milliseconds, read from
 * AGENT_RELAY_ORPHAN_GRACE_MS on every call.
 *
 * @returns the configured window when the variable holds a finite,
 *   non-negative number (an explicit "0" is honoured); otherwise the
 *   30-minute default. Unset, blank, non-numeric, non-finite and negative
 *   values all fall back to the default.
 */
const orphanGraceMs = (): number => {
  const fallbackMs = 30 * 60 * 1000;
  // Number("") and Number("   ") evaluate to 0, not NaN. Without this guard a
  // blank variable (e.g. `AGENT_RELAY_ORPHAN_GRACE_MS=` in an env file) would
  // silently select a zero grace window instead of the default.
  const raw = process.env.AGENT_RELAY_ORPHAN_GRACE_MS?.trim();
  if (!raw) return fallbackMs;
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : fallbackMs;
};
47
+ const deadOwnerTracker = new Map<string, { firstSeenDeadAt: number }>();
33
48
 
34
49
  export function resetOrphanWorktreeStateForTests(): void {
35
50
  flaggedAt.clear();
51
+ deadOwnerTracker.clear();
36
52
  }
37
53
 
38
54
  interface OnlineOrchestrator {
@@ -138,6 +154,17 @@ export interface CollectOrphansResult {
138
154
  baseSha?: string;
139
155
  ownerAgentId?: string;
140
156
  }>;
157
+ /** Non-terminal isolated rows whose worktree IS present on disk (#279). Tracked,
158
+ * not orphaned — the reaper decides on owner-liveness + grace + land-state. */
159
+ deadOwnerWorkspaces: Array<{
160
+ workspaceId: string;
161
+ worktreePath: string;
162
+ repoRoot: string;
163
+ branch?: string;
164
+ baseRef?: string;
165
+ baseSha?: string;
166
+ ownerAgentId?: string;
167
+ }>;
141
168
  reason?: string;
142
169
  }
143
170
 
@@ -150,11 +177,27 @@ export interface CollectOrphansResult {
150
177
  */
151
178
  export async function collectWorkspaceOrphans(): Promise<CollectOrphansResult> {
152
179
  const orchestrators = onlineOrchestrators();
153
- if (!orchestrators.length) return { orphans: [], missingWorktrees: [], reason: "no online orchestrators" };
180
+ if (!orchestrators.length) return { orphans: [], missingWorktrees: [], deadOwnerWorkspaces: [], reason: "no online orchestrators" };
154
181
 
155
182
  const all = listWorkspaces();
156
183
  const orphans: WorkspaceOrphan[] = [];
157
184
  const missingWorktrees: CollectOrphansResult["missingWorktrees"] = [];
185
+ const deadOwnerWorkspaces: CollectOrphansResult["deadOwnerWorkspaces"] = [];
186
+
187
+ // Worktree paths are globally unique, so a row from ANY repoRoot that records a
188
+ // path is the authoritative claim on it. Match disk→DB across ALL rows, NEVER
189
+ // scoped to one repoRoot: the scoped version false-orphaned chained workspaces
190
+ // (managed session → base checkout → isolated worktree, whose row records the base
191
+ // checkout as repoRoot, not the main repo the probe is seeded from), so the reaper
192
+ // destroyed a live session's worktree (#278).
193
+ const rowsByPath = new Map(all.filter((ws) => ws.worktreePath).map((ws) => [resolve(ws.worktreePath), ws]));
194
+
195
+ // Union of every probed repo's on-disk worktrees + the repoRoots we actually
196
+ // reached. The DB→disk (missing-worktree) pass runs ONCE, globally, after the loop
197
+ // against these — per-repo scoping there falsely flagged chained rows as missing
198
+ // (their worktree lives under a different repo's probe than the one iterating). #278
199
+ const onDiskAll = new Set<string>();
200
+ const probedRepoRoots = new Set<string>();
158
201
 
159
202
  for (const repoRoot of knownRepoRoots(all)) {
160
203
  const orch = orchestrators.find((candidate) => candidate.apiUrl && isPathWithinBase(repoRoot, candidate.baseDir));
@@ -162,40 +205,32 @@ export async function collectWorkspaceOrphans(): Promise<CollectOrphansResult> {
162
205
  const probe = await fetchHostProbe(orch.apiUrl, repoRoot);
163
206
  if (!probe?.worktrees) continue;
164
207
 
165
- const liveRowsByPath = new Map(
166
- all
167
- .filter((ws) => ws.repoRoot === repoRoot && ws.worktreePath && !TERMINAL_WORKSPACE_STATUSES.has(ws.status))
168
- .map((ws) => [resolve(ws.worktreePath), ws]),
169
- );
170
- const onDisk = new Set(probe.worktrees.map((wt) => (wt.path ? resolve(wt.path) : "")).filter(Boolean));
171
-
172
- // DB→disk drift: a live isolated row whose worktree is no longer on disk.
173
- for (const [path, ws] of liveRowsByPath) {
174
- if (ws.mode === "isolated" && !onDisk.has(path)) {
175
- missingWorktrees.push({
176
- workspaceId: ws.id,
177
- worktreePath: ws.worktreePath,
178
- repoRoot,
179
- status: ws.status,
180
- branch: ws.branch,
181
- baseRef: ws.baseRef,
182
- baseSha: ws.baseSha,
183
- ownerAgentId: ws.ownerAgentId,
184
- });
185
- }
186
- }
208
+ probedRepoRoots.add(resolve(repoRoot));
209
+ for (const wt of probe.worktrees) if (wt.path) onDiskAll.add(resolve(wt.path));
187
210
 
188
- // disk→DB drift: a worktree on disk with no live row.
189
- const rowsByPath = new Map(
190
- all.filter((ws) => ws.repoRoot === repoRoot && ws.worktreePath).map((ws) => [resolve(ws.worktreePath), ws]),
191
- );
211
+ // disk→DB drift: a worktree on disk with no live row → orphan. A worktree with a
212
+ // live isolated row is NOT an orphan, but if its owner has died it's the #279
213
+ // dead-owner case — surface it for the reaper to evaluate (liveness + grace).
192
214
  for (const worktree of probe.worktrees) {
193
215
  if (!worktree.path || resolve(worktree.path) === resolve(repoRoot)) continue;
194
216
  // Only agent-relay-created worktrees (agent/* branches) are reclaimable —
195
217
  // never touch a user's own linked worktrees.
196
218
  if (!worktree.branch?.startsWith("agent/")) continue;
197
219
  const row = rowsByPath.get(resolve(worktree.path));
198
- if (row && !TERMINAL_WORKSPACE_STATUSES.has(row.status)) continue; // tracked & live
220
+ if (row && !TERMINAL_WORKSPACE_STATUSES.has(row.status)) {
221
+ if (row.mode === "isolated") {
222
+ deadOwnerWorkspaces.push({
223
+ workspaceId: row.id,
224
+ worktreePath: worktree.path,
225
+ repoRoot: row.repoRoot,
226
+ branch: row.branch ?? worktree.branch,
227
+ baseRef: row.baseRef,
228
+ baseSha: row.baseSha,
229
+ ownerAgentId: row.ownerAgentId,
230
+ });
231
+ }
232
+ continue; // tracked & live — not an orphan
233
+ }
199
234
 
200
235
  const orphan: WorkspaceOrphan = {
201
236
  worktreePath: worktree.path,
@@ -223,7 +258,32 @@ export async function collectWorkspaceOrphans(): Promise<CollectOrphansResult> {
223
258
  }
224
259
  }
225
260
 
226
- return { orphans, missingWorktrees };
261
+ // DB→disk drift: a non-terminal isolated row whose worktree is gone from disk. Run
262
+ // once, globally, against the union of probed repos. Flag "missing" only when the
263
+ // path is absent from EVERY probe AND the row was COVERABLE — its repoRoot was
264
+ // probed, OR its repoRoot is itself a worktree we saw on disk (the chained case: an
265
+ // isolated row's repoRoot is its session base checkout, a linked worktree of a
266
+ // probed repo). The coverability gate is what stops rows under an un-probeable host
267
+ // from being falsely flagged missing. #278
268
+ for (const ws of all) {
269
+ if (ws.mode !== "isolated" || !ws.worktreePath) continue;
270
+ if (TERMINAL_WORKSPACE_STATUSES.has(ws.status)) continue;
271
+ if (onDiskAll.has(resolve(ws.worktreePath))) continue;
272
+ const coverable = probedRepoRoots.has(resolve(ws.repoRoot)) || onDiskAll.has(resolve(ws.repoRoot));
273
+ if (!coverable) continue;
274
+ missingWorktrees.push({
275
+ workspaceId: ws.id,
276
+ worktreePath: ws.worktreePath,
277
+ repoRoot: ws.repoRoot,
278
+ status: ws.status,
279
+ branch: ws.branch,
280
+ baseRef: ws.baseRef,
281
+ baseSha: ws.baseSha,
282
+ ownerAgentId: ws.ownerAgentId,
283
+ });
284
+ }
285
+
286
+ return { orphans, missingWorktrees, deadOwnerWorkspaces };
227
287
  }
228
288
 
229
289
  function dispatchCleanup(orch: OnlineOrchestrator, orphan: WorkspaceOrphan): string {
@@ -254,7 +314,7 @@ function dispatchCleanup(orch: OnlineOrchestrator, orphan: WorkspaceOrphan): str
254
314
  * directions surface. Never removes on uncertainty.
255
315
  */
256
316
  export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>> {
257
- const { orphans, missingWorktrees, reason } = await collectWorkspaceOrphans();
317
+ const { orphans, missingWorktrees, deadOwnerWorkspaces, reason } = await collectWorkspaceOrphans();
258
318
  if (reason) return { skipped: reason };
259
319
 
260
320
  const orchestrators = onlineOrchestrators();
@@ -263,14 +323,27 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
263
323
  const flagged: string[] = [];
264
324
  const autoAbandoned: string[] = [];
265
325
  const flaggedMissingWorktrees: string[] = [];
326
+ const deadOwnerReaped: string[] = [];
327
+ const deadOwnerFlagged: string[] = [];
266
328
  const now = Date.now();
267
329
 
330
+ // Defense in depth (#278): the matching fix in collectWorkspaceOrphans should keep
331
+ // any path with a live, non-terminal row out of `orphans`. This is the belt to that
332
+ // suspenders — re-read rows fresh and never reap a path one still claims, even if a
333
+ // future refactor reintroduces a lookup gap. False-reaping a live session is fatal.
334
+ const nonTerminalRowsByPath = new Map(
335
+ listWorkspaces()
336
+ .filter((ws) => ws.worktreePath && !TERMINAL_WORKSPACE_STATUSES.has(ws.status))
337
+ .map((ws) => [resolve(ws.worktreePath), ws]),
338
+ );
339
+
268
340
  for (const orphan of orphans) {
269
341
  const orch = orchestrators.find((candidate) => isPathWithinBase(orphan.repoRoot, candidate.baseDir));
270
342
  if (!orch) continue;
271
343
 
272
344
  if (orphan.safeToReap === true) {
273
345
  if (!reapEnabled) continue; // detect-only mode
346
+ if (nonTerminalRowsByPath.has(resolve(orphan.worktreePath))) continue; // still claimed — never reap (#278)
274
347
  const commandId = dispatchCleanup(orch, orphan);
275
348
  reaped.push(orphan.worktreePath);
276
349
  flaggedAt.delete(orphan.worktreePath);
@@ -315,7 +388,7 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
315
388
  if (!workspace || TERMINAL_WORKSPACE_STATUSES.has(workspace.status) || workspace.mode !== "isolated" || !workspace.worktreePath) continue;
316
389
  const key = `missing:${workspace.worktreePath}`;
317
390
  const ownerAlive = isOwnerAlive(workspace.ownerAgentId);
318
- const inFlight = IN_FLIGHT_MISSING_WORKTREE_STATUSES.has(workspace.status);
391
+ const inFlight = IN_FLIGHT_WORKSPACE_STATUSES.has(workspace.status);
319
392
  const last = flaggedAt.get(key) ?? 0;
320
393
  if (ownerAlive || inFlight) {
321
394
  if (now - last < UNLANDED_FLAG_COOLDOWN_MS) continue;
@@ -410,9 +483,95 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
410
483
  });
411
484
  }
412
485
 
486
+ // #279: a tracked worktree whose owner died — the common stale case the reaper
487
+ // never handled (inverse of #278's false kill). Gate on owner-liveness + a grace
488
+ // window + "nothing would be lost", then dispatch through the GUARDED action (the
489
+ // #254 isOwnerAlive re-check at dispatch is the safety net against a stale read).
490
+ const graceMs = orphanGraceMs();
491
+ const liveDeadOwnerKeys = new Set<string>();
492
+ for (const cand of deadOwnerWorkspaces) {
493
+ const workspace = getWorkspace(cand.workspaceId);
494
+ if (!workspace || workspace.mode !== "isolated" || !workspace.worktreePath) continue;
495
+ if (TERMINAL_WORKSPACE_STATUSES.has(workspace.status)) continue;
496
+ if (IN_FLIGHT_WORKSPACE_STATUSES.has(workspace.status)) continue; // cleanup/merge already dispatched
497
+ const key = `dead-owner:${resolve(workspace.worktreePath)}`;
498
+ liveDeadOwnerKeys.add(key);
499
+
500
+ if (isOwnerAlive(workspace.ownerAgentId)) {
501
+ deadOwnerTracker.delete(key); // owner alive — reset the grace window (regression vs #278)
502
+ continue;
503
+ }
504
+
505
+ // Owner observed dead. Require continuous dead-ness for the grace window so a
506
+ // reconnecting agent isn't raced; the in-memory tracker re-arms on restart. A
507
+ // grace of 0 reaps on the first dead observation (the env's escape hatch).
508
+ const tracked = deadOwnerTracker.get(key);
509
+ const firstSeenDeadAt = tracked?.firstSeenDeadAt ?? now;
510
+ if (!tracked) deadOwnerTracker.set(key, { firstSeenDeadAt });
511
+ if (now - firstSeenDeadAt < graceMs) continue;
512
+
513
+ // Grace elapsed + owner dead. Gate on land-state — "nothing would be lost".
514
+ const orch = orchestrators.find((candidate) => candidate.apiUrl && isPathWithinBase(workspace.repoRoot, candidate.baseDir));
515
+ const preview = orch?.apiUrl
516
+ ? await fetchWorktreeReapState(orch.apiUrl, workspace.worktreePath, workspace.baseRef)
517
+ : null;
518
+ const safeToReap = preview && !preview.missing && !preview.error
519
+ ? worktreeReapable({ landed: preview.landed, ahead: preview.ahead, unmergedAhead: preview.unmergedAhead, dirtyCount: preview.dirtyCount })
520
+ : undefined;
521
+
522
+ if (safeToReap === true) {
523
+ if (!reapEnabled) continue; // detect-only mode
524
+ // Route through the guarded action (#279): no `force`, so a flipped-alive owner
525
+ // is rejected (409) — our safety net. Also does the cleanup_requested transition
526
+ // + audit. Attributed to a distinct requester so incidents are traceable.
527
+ const result = applyWorkspaceAction(workspace, {
528
+ action: "cleanup",
529
+ agentId: "workspace-dead-owner-reaper",
530
+ auditMetadata: { source: "server", maintenanceJobId: "workspace-orphan-reaper", requestedBy: "workspace-dead-owner-reaper", deadOwner: workspace.ownerAgentId },
531
+ });
532
+ if (!result.ok) continue; // guard tripped (owner came back) or no owning orchestrator — re-evaluate next sweep
533
+ if (result.command) {
534
+ emitRelayEvent({ type: `command.${result.command.status}`, source: result.command.source, subject: result.command.id, data: { command: result.command } });
535
+ }
536
+ deadOwnerReaped.push(workspace.id);
537
+ deadOwnerTracker.delete(key);
538
+ continue;
539
+ }
540
+
541
+ // Un-landed or un-probeable: flag as stranded (dead owner), never remove.
542
+ const flagKey = `dead-owner-stranded:${resolve(workspace.worktreePath)}`;
543
+ liveDeadOwnerKeys.add(flagKey);
544
+ const last = flaggedAt.get(flagKey) ?? 0;
545
+ if (now - last < UNLANDED_FLAG_COOLDOWN_MS) continue;
546
+ flaggedAt.set(flagKey, now);
547
+ deadOwnerFlagged.push(workspace.id);
548
+ const detail = safeToReap === undefined
549
+ ? "host could not be probed for land-state"
550
+ : (preview && (preview.dirtyCount ?? 0) > 0)
551
+ ? "uncommitted changes in the worktree"
552
+ : `${preview?.unmergedAhead ?? preview?.ahead ?? "?"} un-landed commit(s)`;
553
+ createActivityEvent({
554
+ clientId: `workspace-dead-owner-stranded-${workspace.id}-${now}`,
555
+ kind: "state",
556
+ title: "Dead-owner worktree needs attention",
557
+ body: `${workspace.branch ?? workspace.id} in ${workspace.repoRoot} — owning agent ${workspace.ownerAgentId ?? "?"} is gone and the worktree holds work that hasn't landed (${detail}). Recover the commits, then clean it up explicitly.`,
558
+ meta: workspace.branch ?? workspace.id,
559
+ icon: "ti-alert-triangle",
560
+ view: "orchestrators",
561
+ metadata: { source: "server", maintenanceJobId: "workspace-orphan-reaper", workspaceId: workspace.id, worktreePath: workspace.worktreePath, branch: workspace.branch, ownerAgentId: workspace.ownerAgentId, deadOwner: true },
562
+ });
563
+ }
564
+ // Drop grace-tracker entries for worktrees that are gone (reaped/landed) so a
565
+ // future re-orphaning re-arms the window from scratch.
566
+ for (const key of deadOwnerTracker.keys()) if (!liveDeadOwnerKeys.has(key)) deadOwnerTracker.delete(key);
567
+
413
568
  // Forget cooldown entries for orphans that are gone (reaped/recovered) so a
414
569
  // future re-orphaning of the same path re-announces immediately.
415
- const liveKeys = new Set([...orphans.map((o) => o.worktreePath), ...missingWorktrees.map((m) => `missing:${m.worktreePath}`)]);
570
+ const liveKeys = new Set([
571
+ ...orphans.map((o) => o.worktreePath),
572
+ ...missingWorktrees.map((m) => `missing:${m.worktreePath}`),
573
+ ...liveDeadOwnerKeys,
574
+ ]);
416
575
  for (const key of flaggedAt.keys()) if (!liveKeys.has(key) && !reaped.includes(key)) flaggedAt.delete(key);
417
576
 
418
577
  return {
@@ -422,6 +581,8 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
422
581
  autoAbandoned,
423
582
  flaggedMissingWorktrees,
424
583
  missingWorktrees: missingWorktrees.map((m) => m.workspaceId),
584
+ deadOwnerReaped,
585
+ deadOwnerFlagged,
425
586
  reapEnabled,
426
587
  };
427
588
  }