agent-relay-server 0.11.6 → 0.11.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "Agent Relay API",
5
- "version": "0.11.3",
5
+ "version": "0.11.4",
6
6
  "description": "Real-time message bus for inter-agent communication. Agent-first: this spec is designed for machine consumption — agents can self-discover the full API surface via GET /api/spec.",
7
7
  "license": {
8
8
  "name": "MIT",
@@ -3682,6 +3682,35 @@
3682
3682
  ]
3683
3683
  }
3684
3684
  },
3685
+ "/api/workspaces/stewards": {
3686
+ "get": {
3687
+ "operationId": "getWorkspaceStewards",
3688
+ "summary": "List repo stewards and merge leases",
3689
+ "tags": [
3690
+ "Other"
3691
+ ],
3692
+ "description": "Per-repo coordination state for isolated workspaces. Returns `{ stewards, mergeLeases }`. Each steward record (`repoRoot`, `stewardAgentId`, `lastStewardAgentId`, `electedAt`) is persistent — it survives a full all-agents-offline gap (steward goes null/dormant, last-known preserved) and is re-elected when an agent rejoins the repo. Each merge lease (`repoRoot`, `workspaceId`, `commandId`, `holder`, `expiresAt`) marks an in-flight base merge; only one may be held per repo so concurrent merges into base are serialized.",
3693
+ "responses": {
3694
+ "200": {
3695
+ "description": "Success",
3696
+ "content": {
3697
+ "application/json": {}
3698
+ }
3699
+ }
3700
+ },
3701
+ "security": [
3702
+ {
3703
+ "bearerAuth": []
3704
+ },
3705
+ {
3706
+ "tokenHeader": []
3707
+ },
3708
+ {
3709
+ "tokenQuery": []
3710
+ }
3711
+ ]
3712
+ }
3713
+ },
3685
3714
  "/api/workspaces/{id}": {
3686
3715
  "get": {
3687
3716
  "operationId": "getWorkspaceById",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-server",
3
- "version": "0.11.6",
3
+ "version": "0.11.8",
4
4
  "description": "Lightweight HTTP message relay for inter-agent communication across machines",
5
5
  "module": "src/index.ts",
6
6
  "type": "module",
@@ -39,14 +39,16 @@
39
39
  "postinstall": "node scripts/install-bin-shim.cjs",
40
40
  "start": "bun run src/index.ts",
41
41
  "dev": "bun --watch run src/index.ts",
42
- "dev:dashboard": "cd dashboard && npx vite",
42
+ "dev:dashboard": "cd dashboard && bun run dev",
43
43
  "build:sdk": "cd sdk && bun run build",
44
- "build:dashboard": "bun run build:sdk && cd dashboard && npx vite build",
44
+ "build:dashboard": "bun run build:sdk && cd dashboard && bun run build",
45
45
  "test": "bun test",
46
46
  "smoke:spawn": "bun run scripts/orchestrator-spawn-smoke.ts",
47
47
  "typecheck": "tsc --noEmit",
48
48
  "docs:api": "bun run scripts/extract-api-spec.ts",
49
- "docs:api:check": "bun run scripts/extract-api-spec.ts --check"
49
+ "docs:api:check": "bun run scripts/extract-api-spec.ts --check",
50
+ "release": "bun run scripts/release.ts",
51
+ "publish:ci": "bun run scripts/publish-ci.ts"
50
52
  },
51
53
  "keywords": [
52
54
  "agent-relay",
package/src/config.ts CHANGED
@@ -24,6 +24,10 @@ export const OFFLINE_PRUNE_MS = envPositiveInt("OFFLINE_PRUNE_MS", DAY_MS); // 2
24
24
  export const REAP_INTERVAL_MS = envPositiveInt("REAP_INTERVAL_MS", 60_000); // reaper cadence
25
25
  export const CLAIM_LEASE_MS = envPositiveInt("AGENT_RELAY_CLAIM_LEASE_MS", 1_800_000); // 30min claim lease
26
26
  export const POOL_CLAIM_LEASE_MS = envPositiveInt("AGENT_RELAY_POOL_CLAIM_LEASE_MS", STALE_TTL_MS * 3); // pool binding lease
27
+ // Per-repo merge serialization lease — only one base merge may run at a time per
28
+ // repo. Acquired just before a workspace.merge command is dispatched and held until it settles
29
+ // (or this TTL expires, in case the orchestrator never reports back).
30
+ export const WORKSPACE_MERGE_LEASE_MS = envPositiveInt("AGENT_RELAY_WORKSPACE_MERGE_LEASE_MS", 900_000); // 15min
27
31
 
28
32
  // Max body size for any POST/PATCH request (64 KiB).
29
33
  export const MAX_BODY_BYTES = 64 * 1024;
package/src/db.ts CHANGED
@@ -71,7 +71,7 @@ import type {
71
71
  WorkspaceRecord,
72
72
  WorkspaceStatus,
73
73
  } from "./types";
74
- import { STALE_TTL_MS, DAY_MS, CLAIM_LEASE_MS, POOL_CLAIM_LEASE_MS } from "./config";
74
+ import { STALE_TTL_MS, DAY_MS, CLAIM_LEASE_MS, POOL_CLAIM_LEASE_MS, WORKSPACE_MERGE_LEASE_MS } from "./config";
75
75
 
76
76
  let db: Database;
77
77
  const CONTEXT_SNAPSHOT_DEBOUNCE_MS = 60_000;
@@ -379,6 +379,31 @@ export function initDb(path: string = "agent-relay.db"): Database {
379
379
  CREATE INDEX IF NOT EXISTS idx_workspaces_owner_agent ON workspaces(owner_agent_id);
380
380
  CREATE INDEX IF NOT EXISTS idx_workspaces_policy ON workspaces(owner_policy_name);
381
381
 
382
+ -- Persistent per-repo steward record. Keyed to the repo, not a live agent, so
383
+ -- it survives a full all-agents-offline gap: steward_agent_id goes NULL
384
+ -- (dormant) while last_steward_agent_id preserves continuity, and the row is
385
+ -- re-filled when an agent rejoins the repo. This is the durable backing store
386
+ -- the steward column on workspace rows mirrors for display/maintenance.
387
+ CREATE TABLE IF NOT EXISTS repo_stewards (
388
+ repo_root TEXT PRIMARY KEY,
389
+ steward_agent_id TEXT,
390
+ last_steward_agent_id TEXT,
391
+ elected_at INTEGER,
392
+ updated_at INTEGER NOT NULL
393
+ );
394
+
395
+ -- Per-repo merge serialization lease. Exactly one base merge may be in flight
396
+ -- per repo; a second merge request is rejected until the holder settles or the
397
+ -- lease expires. Atomicity comes from the repo_root PRIMARY KEY + expiry guard.
398
+ CREATE TABLE IF NOT EXISTS workspace_merge_leases (
399
+ repo_root TEXT PRIMARY KEY,
400
+ workspace_id TEXT NOT NULL,
401
+ command_id TEXT,
402
+ holder TEXT,
403
+ acquired_at INTEGER NOT NULL,
404
+ expires_at INTEGER NOT NULL
405
+ );
406
+
382
407
  CREATE TABLE IF NOT EXISTS tasks (
383
408
  id INTEGER PRIMARY KEY AUTOINCREMENT,
384
409
  source TEXT NOT NULL,
@@ -1692,6 +1717,9 @@ export function upsertAgent(input: RegisterAgentInput): AgentCard {
1692
1717
  const agent = getAgent(input.id)!;
1693
1718
  if (agent.kind === "channel") upsertChannelForAgent(agent);
1694
1719
  evaluatePoolBindings();
1720
+ // A (re)joining agent may revive a dormant repo steward — re-elect for the
1721
+ // repos it owns live workspaces in (issue #157, steward survives offline gap).
1722
+ if (agent.status !== "offline") electWorkspaceStewardsForAgent(agent.id);
1695
1723
  return agent;
1696
1724
  }
1697
1725
 
@@ -4140,13 +4168,16 @@ function relayConversationId(message: Message): string | undefined {
4140
4168
  }
4141
4169
 
4142
4170
  function isCoveredByLaterAgentResponse(message: Message, agentId: string): boolean {
4171
+ // Order by id, not created_at: ids are monotonic insertion order, so this is
4172
+ // robust when a reply lands in the same millisecond as the message it covers
4173
+ // (created_at > … strictly would miss it, leaving the message wrongly pending).
4143
4174
  const replies = (db.prepare(`
4144
4175
  ${MSG_SELECT}
4145
4176
  WHERE m.from_agent = ?
4146
- AND m.created_at > ?
4147
- ORDER BY m.created_at ASC
4177
+ AND m.id > ?
4178
+ ORDER BY m.id ASC
4148
4179
  LIMIT 200
4149
- `).all(agentId, message.createdAt) as any[]).map(rowToMessage);
4180
+ `).all(agentId, message.id) as any[]).map(rowToMessage);
4150
4181
 
4151
4182
  const conversationId = relayConversationId(message);
4152
4183
  return replies.some((reply) => {
@@ -5108,31 +5139,203 @@ export function updateWorkspaceStatus(id: string, status: WorkspaceStatus, metad
5108
5139
  return getWorkspace(id);
5109
5140
  }
5110
5141
 
5142
+ // Workspace statuses that count as "live" for stewardship — an agent owning one
5143
+ // of these is a candidate steward; the repo is worth coordinating.
5144
+ const STEWARD_LIVE_STATUSES = "'active', 'ready', 'conflict', 'review_requested', 'merge_planned'";
5145
+
5146
+ export interface RepoStewardRecord {
5147
+ repoRoot: string;
5148
+ stewardAgentId?: string;
5149
+ lastStewardAgentId?: string;
5150
+ electedAt?: number;
5151
+ updatedAt: number;
5152
+ }
5153
+
5154
+ function rowToRepoSteward(row: any): RepoStewardRecord {
5155
+ return {
5156
+ repoRoot: row.repo_root,
5157
+ stewardAgentId: row.steward_agent_id ?? undefined,
5158
+ lastStewardAgentId: row.last_steward_agent_id ?? undefined,
5159
+ electedAt: row.elected_at ?? undefined,
5160
+ updatedAt: row.updated_at,
5161
+ };
5162
+ }
5163
+
5164
+ export function getRepoSteward(repoRoot: string): RepoStewardRecord | null {
5165
+ const row = db.prepare("SELECT * FROM repo_stewards WHERE repo_root = ?").get(repoRoot) as any;
5166
+ return row ? rowToRepoSteward(row) : null;
5167
+ }
5168
+
5169
+ export function listRepoStewards(): RepoStewardRecord[] {
5170
+ return (db.prepare("SELECT * FROM repo_stewards ORDER BY updated_at DESC").all() as any[]).map(rowToRepoSteward);
5171
+ }
5172
+
5173
+ // Persist the elected steward for a repo. The row is never deleted, so a repo's
5174
+ // stewardship survives a full all-agents-offline gap (steward goes NULL/dormant,
5175
+ // last_steward_agent_id keeps continuity) and resumes on the next agent join.
5176
+ function upsertRepoSteward(repoRoot: string, steward: string | null, now: number): void {
5177
+ db.prepare(`
5178
+ INSERT INTO repo_stewards (repo_root, steward_agent_id, last_steward_agent_id, elected_at, updated_at)
5179
+ VALUES ($repoRoot, $steward, $steward, $electedAt, $now)
5180
+ ON CONFLICT(repo_root) DO UPDATE SET
5181
+ steward_agent_id = $steward,
5182
+ last_steward_agent_id = coalesce($steward, repo_stewards.last_steward_agent_id),
5183
+ elected_at = CASE
5184
+ WHEN $steward IS NOT NULL AND $steward IS NOT repo_stewards.steward_agent_id THEN $now
5185
+ ELSE repo_stewards.elected_at
5186
+ END,
5187
+ updated_at = $now
5188
+ `).run({ $repoRoot: repoRoot, $steward: steward, $electedAt: steward ? now : null, $now: now });
5189
+ }
5190
+
5111
5191
  function electWorkspaceStewards(repoRoot?: string): void {
5112
5192
  const params: string[] = repoRoot ? [repoRoot] : [];
5113
5193
  const repoRows = db.prepare(`
5114
5194
  SELECT DISTINCT repo_root FROM workspaces
5115
- WHERE status IN ('active', 'ready', 'conflict', 'review_requested', 'merge_planned')
5195
+ WHERE status IN (${STEWARD_LIVE_STATUSES})
5116
5196
  ${repoRoot ? "AND repo_root = ?" : ""}
5117
5197
  `).all(...params) as Array<{ repo_root: string }>;
5198
+ const now = Date.now();
5118
5199
  for (const row of repoRows) {
5119
- const current = db.prepare(`
5120
- SELECT steward_agent_id FROM workspaces
5121
- WHERE repo_root = ? AND steward_agent_id IS NOT NULL AND status IN ('active', 'ready', 'conflict', 'review_requested', 'merge_planned')
5122
- LIMIT 1
5123
- `).get(row.repo_root) as { steward_agent_id?: string } | undefined;
5124
- const currentAgent = current?.steward_agent_id ? getAgent(current.steward_agent_id) : null;
5125
- const steward = currentAgent && currentAgent.status !== "offline" && current?.steward_agent_id
5126
- ? current.steward_agent_id
5127
- : ((db.prepare(`
5128
- SELECT owner_agent_id FROM workspaces
5129
- WHERE repo_root = ? AND owner_agent_id IS NOT NULL AND status IN ('active', 'ready', 'conflict', 'review_requested', 'merge_planned')
5130
- ORDER BY created_at ASC
5131
- LIMIT 1
5132
- `).get(row.repo_root) as { owner_agent_id?: string } | undefined)?.owner_agent_id);
5133
- db.prepare("UPDATE workspaces SET steward_agent_id = ?, updated_at = ? WHERE repo_root = ? AND status IN ('active', 'ready', 'conflict', 'review_requested', 'merge_planned')")
5134
- .run(steward ?? null, Date.now(), row.repo_root);
5200
+ // Candidate pool: owners of live workspaces in this repo who are online,
5201
+ // oldest first. A steward must be an online agent actively in the repo — an
5202
+ // offline agent can't coordinate, so it is never elected (the old bug).
5203
+ const pool = (db.prepare(`
5204
+ SELECT w.owner_agent_id AS id, MIN(w.created_at) AS created_at
5205
+ FROM workspaces w JOIN agents a ON a.id = w.owner_agent_id
5206
+ WHERE w.repo_root = ? AND w.owner_agent_id IS NOT NULL
5207
+ AND a.status != 'offline' AND w.status IN (${STEWARD_LIVE_STATUSES})
5208
+ GROUP BY w.owner_agent_id
5209
+ ORDER BY created_at ASC
5210
+ `).all(row.repo_root) as Array<{ id: string }>).map((r) => r.id);
5211
+
5212
+ // Keep the current steward if it is still in the pool (stable election);
5213
+ // otherwise promote the oldest online owner, else go dormant (null).
5214
+ const current = getRepoSteward(row.repo_root)?.stewardAgentId ?? null;
5215
+ const steward = (current && pool.includes(current) ? current : pool[0]) ?? null;
5216
+
5217
+ upsertRepoSteward(row.repo_root, steward, now);
5218
+ // Mirror onto live workspace rows only when the steward actually changed, so
5219
+ // re-elections don't churn updated_at and reset the auto-abandon clock for a
5220
+ // dormant repo (a stranded review_requested must still age out).
5221
+ if (steward !== current) {
5222
+ db.prepare(`UPDATE workspaces SET steward_agent_id = ?, updated_at = ? WHERE repo_root = ? AND status IN (${STEWARD_LIVE_STATUSES})`)
5223
+ .run(steward, now, row.repo_root);
5224
+ }
5225
+ }
5226
+ }
5227
+
5228
+ // Public re-election trigger that does not change any workspace status — used by
5229
+ // maintenance to revive a dormant steward (e.g. on the next agent join) before
5230
+ // deciding whether a stranded worktree needs escalation.
5231
+ export function reelectRepoSteward(repoRoot: string): void {
5232
+ electWorkspaceStewards(repoRoot);
5233
+ }
5234
+
5235
+ // Merge a metadata patch into a workspace WITHOUT bumping updated_at or running a
5236
+ // steward election. For maintenance bookkeeping (stranded/escalation markers)
5237
+ // that must not disturb age-based GC timers. undefined values delete keys.
5238
+ export function patchWorkspaceMetadata(id: string, patch: Record<string, unknown>): WorkspaceRecord | null {
5239
+ const existing = getWorkspace(id);
5240
+ if (!existing) return null;
5241
+ const next = { ...existing.metadata };
5242
+ for (const [k, v] of Object.entries(patch)) {
5243
+ if (v === undefined) delete next[k];
5244
+ else next[k] = v;
5135
5245
  }
5246
+ db.prepare("UPDATE workspaces SET metadata = ? WHERE id = ?").run(JSON.stringify(next), id);
5247
+ return getWorkspace(id);
5248
+ }
5249
+
5250
+ // Re-elect stewards for every repo where an agent owns a live workspace. Called
5251
+ // when an agent (re)registers so a dormant repo regains a steward on rejoin
5252
+ // without a full unscoped sweep.
5253
+ function electWorkspaceStewardsForAgent(agentId: string): void {
5254
+ const repos = db.prepare(`
5255
+ SELECT DISTINCT repo_root FROM workspaces
5256
+ WHERE owner_agent_id = ? AND status IN (${STEWARD_LIVE_STATUSES})
5257
+ `).all(agentId) as Array<{ repo_root: string }>;
5258
+ for (const r of repos) electWorkspaceStewards(r.repo_root);
5259
+ }
5260
+
5261
+ // --- Per-repo merge serialization lease (issue #157) -----------------------
5262
+
5263
+ export interface MergeLeaseRecord {
5264
+ repoRoot: string;
5265
+ workspaceId: string;
5266
+ commandId?: string;
5267
+ holder?: string;
5268
+ acquiredAt: number;
5269
+ expiresAt: number;
5270
+ }
5271
+
5272
+ function rowToMergeLease(row: any): MergeLeaseRecord {
5273
+ return {
5274
+ repoRoot: row.repo_root,
5275
+ workspaceId: row.workspace_id,
5276
+ commandId: row.command_id ?? undefined,
5277
+ holder: row.holder ?? undefined,
5278
+ acquiredAt: row.acquired_at,
5279
+ expiresAt: row.expires_at,
5280
+ };
5281
+ }
5282
+
5283
+ export function getMergeLease(repoRoot: string): MergeLeaseRecord | null {
5284
+ const row = db.prepare("SELECT * FROM workspace_merge_leases WHERE repo_root = ?").get(repoRoot) as any;
5285
+ return row ? rowToMergeLease(row) : null;
5286
+ }
5287
+
5288
+ export function listMergeLeases(): MergeLeaseRecord[] {
5289
+ return (db.prepare("SELECT * FROM workspace_merge_leases ORDER BY acquired_at DESC").all() as any[]).map(rowToMergeLease);
5290
+ }
5291
+
5292
+ export function releaseExpiredMergeLeases(now: number = Date.now()): string[] {
5293
+ const expired = db.prepare("SELECT repo_root FROM workspace_merge_leases WHERE expires_at <= ?").all(now) as Array<{ repo_root: string }>;
5294
+ if (!expired.length) return [];
5295
+ db.prepare("DELETE FROM workspace_merge_leases WHERE expires_at <= ?").run(now);
5296
+ return expired.map((r) => r.repo_root);
5297
+ }
5298
+
5299
+ // Atomically acquire the per-repo merge lease. Succeeds if no live lease is held
5300
+ // for the repo (or the existing one has expired). Serialized via db.transaction
5301
+ // so two concurrent merge requests for the same repo can't both win.
5302
+ export function acquireMergeLease(
5303
+ repoRoot: string,
5304
+ workspaceId: string,
5305
+ holder?: string,
5306
+ ): { ok: true; lease: MergeLeaseRecord } | { ok: false; lease: MergeLeaseRecord } {
5307
+ return db.transaction(() => {
5308
+ const now = Date.now();
5309
+ const existing = getMergeLease(repoRoot);
5310
+ if (existing && existing.expiresAt > now) return { ok: false as const, lease: existing };
5311
+ const expiresAt = now + WORKSPACE_MERGE_LEASE_MS;
5312
+ db.prepare(`
5313
+ INSERT INTO workspace_merge_leases (repo_root, workspace_id, command_id, holder, acquired_at, expires_at)
5314
+ VALUES (?, ?, NULL, ?, ?, ?)
5315
+ ON CONFLICT(repo_root) DO UPDATE SET
5316
+ workspace_id = excluded.workspace_id, command_id = NULL, holder = excluded.holder,
5317
+ acquired_at = excluded.acquired_at, expires_at = excluded.expires_at
5318
+ `).run(repoRoot, workspaceId, holder ?? null, now, expiresAt);
5319
+ return { ok: true as const, lease: getMergeLease(repoRoot)! };
5320
+ })();
5321
+ }
5322
+
5323
+ // Attach the dispatched command id to a held lease so it can be released by
5324
+ // command id when the merge settles.
5325
+ export function setMergeLeaseCommand(repoRoot: string, commandId: string): void {
5326
+ db.prepare("UPDATE workspace_merge_leases SET command_id = ? WHERE repo_root = ?").run(commandId, repoRoot);
5327
+ }
5328
+
5329
+ // Release a merge lease. Guard by commandId/workspaceId when known so a stale
5330
+ // release can't drop a newer lease for the same repo.
5331
+ export function releaseMergeLease(opts: { repoRoot?: string; commandId?: string; workspaceId?: string }): boolean {
5332
+ const where: string[] = [];
5333
+ const params: string[] = [];
5334
+ if (opts.repoRoot) { where.push("repo_root = ?"); params.push(opts.repoRoot); }
5335
+ if (opts.commandId) { where.push("command_id = ?"); params.push(opts.commandId); }
5336
+ if (opts.workspaceId) { where.push("workspace_id = ?"); params.push(opts.workspaceId); }
5337
+ if (!where.length) return false;
5338
+ return db.prepare(`DELETE FROM workspace_merge_leases WHERE ${where.join(" AND ")}`).run(...params).changes > 0;
5136
5339
  }
5137
5340
 
5138
5341
  export function deleteOrchestrator(id: string): boolean {
@@ -9,16 +9,22 @@ import {
9
9
  createActivityEvent,
10
10
  evaluatePoolBindings,
11
11
  expireQueuedMessages,
12
+ getAgent,
12
13
  getDb,
14
+ getRepoSteward,
15
+ getWorkspace,
13
16
  listOrchestrators,
14
17
  listWorkspaces,
18
+ patchWorkspaceMetadata,
15
19
  pruneOfflineAgents,
16
20
  pruneOldMessages,
17
21
  deleteWorkspace,
18
22
  pruneOrphanedSharedWorkspaces,
19
23
  reapStaleAgents,
20
24
  reapStaleOrchestrators,
25
+ reelectRepoSteward,
21
26
  releaseExpiredClaims,
27
+ releaseExpiredMergeLeases,
22
28
  releaseOrphanedTasks,
23
29
  sendMessage,
24
30
  sweepArtifacts,
@@ -49,6 +55,15 @@ const CONFLICT_SCAN_INTERVAL_MS = Number(process.env.AGENT_RELAY_CONFLICT_SCAN_I
49
55
  const WORKSPACE_RETENTION_MS = Number(process.env.AGENT_RELAY_WORKSPACE_RETENTION_MS) || DAY_MS;
50
56
  const WORKSPACE_REVIEW_TTL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_REVIEW_TTL_MS) || 3 * DAY_MS;
51
57
  const WORKSPACE_GC_INTERVAL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_GC_INTERVAL_MS) || 60 * 60 * 1000;
58
+ // How long a stranded review_requested/conflict worktree (no online steward) may
59
+ // sit before escalating to the configured fallback target, and the durable
60
+ // escalation target itself (`policy:<name>`, `label:<name>`, `cap:<name>`, an
61
+ // agent id, or `broadcast`). Read at call-time so config changes take effect
62
+ // without a restart (issue #157).
63
+ const stewardEscalationMs = () => Number(process.env.AGENT_RELAY_WORKSPACE_STEWARD_ESCALATION_MS) || 60 * 60 * 1000;
64
+ const stewardFallbackTarget = () => (process.env.AGENT_RELAY_WORKSPACE_STEWARD_FALLBACK || "").trim();
65
+ // Statuses that need an owner — a stranded one of these is what escalation rescues.
66
+ const STRANDABLE_STATUSES = new Set<WorkspaceStatus>(["review_requested", "conflict"]);
52
67
  // Live statuses worth scanning. Terminal (cleaned/merged/abandoned) and
53
68
  // in-flight (cleanup_requested) states are skipped.
54
69
  const CONFLICT_SCAN_STATUSES = new Set<WorkspaceStatus>(["active", "ready", "review_requested", "merge_planned", "conflict"]);
@@ -413,11 +428,27 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
413
428
  return { scanned: candidates.length, flagged, cleared, notifiedStewards };
414
429
  }
415
430
 
431
+ // Send a system DM, swallowing failures (a stale/missing/misconfigured target
432
+ // must never break the GC sweep). Returns the target on success, null otherwise.
433
+ function notifyTarget(target: string, subject: string, body: string, payload: Record<string, unknown>): string | null {
434
+ if (!target) return null;
435
+ try {
436
+ emitNewMessage(sendMessage({ from: "system", to: target, kind: "system", subject, body, payload }));
437
+ return target;
438
+ } catch {
439
+ return null;
440
+ }
441
+ }
442
+
416
443
  async function workspaceGC(): Promise<Record<string, unknown>> {
417
444
  const now = Date.now();
418
445
  const cutoff = now - WORKSPACE_RETENTION_MS;
419
446
  const reviewCutoff = now - WORKSPACE_REVIEW_TTL_MS;
420
447
 
448
+ // 0. Free any merge leases whose holder never reported back (orchestrator died
449
+ // mid-merge). The lease TTL is the safety net; this just reclaims them eagerly.
450
+ const releasedLeaseRepos = releaseExpiredMergeLeases(now);
451
+
421
452
  // 1. Prune terminal rows past retention
422
453
  const all = listWorkspaces();
423
454
  const terminalIds: string[] = [];
@@ -428,29 +459,84 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
428
459
  }
429
460
  }
430
461
 
431
- // 2. Auto-abandon stale review_requested worktrees
462
+ // 2. Rescue stranded review_requested/conflict worktrees (issue #157). A
463
+ // worktree is "stranded" when its steward is gone (all repo agents offline).
464
+ // Re-elect first — an agent may have rejoined — and hand off to the new
465
+ // steward; if none can be elected past the TTL, escalate to the fallback
466
+ // target so it never rots in silence. Bookkeeping uses patchWorkspaceMetadata
467
+ // (no updated_at bump) so the auto-abandon clock below keeps ticking.
468
+ const escalatedIds: string[] = [];
469
+ const reassignedIds: string[] = [];
470
+ const escalationTargets: string[] = [];
471
+ const escalationMs = stewardEscalationMs();
472
+ const fallbackTarget = stewardFallbackTarget();
473
+ for (const ws of all) {
474
+ if (!STRANDABLE_STATUSES.has(ws.status) || ws.mode !== "isolated" || !ws.worktreePath) continue;
475
+ reelectRepoSteward(ws.repoRoot);
476
+ const fresh = getWorkspace(ws.id);
477
+ if (!fresh || !STRANDABLE_STATUSES.has(fresh.status)) continue;
478
+ const meta = fresh.metadata as Record<string, unknown>;
479
+ const steward = fresh.stewardAgentId;
480
+ const stewardOnline = Boolean(steward && getAgent(steward) && getAgent(steward)!.status !== "offline");
481
+ const strandedAt = typeof meta.strandedAt === "number" ? meta.strandedAt : undefined;
482
+
483
+ if (stewardOnline) {
484
+ // An online steward owns it. If it was previously stranded and this
485
+ // steward hasn't been told, hand it off explicitly, then clear markers.
486
+ if (strandedAt !== undefined && meta.strandedNotifiedSteward !== steward) {
487
+ const sent = notifyTarget(
488
+ steward!,
489
+ "Workspace stewardship reassigned",
490
+ `You are now steward for ${fresh.repoRoot}. Workspace \`${fresh.branch ?? fresh.id}\` is ${fresh.status} and was stranded without an online steward — please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"}.`,
491
+ { kind: "workspace.steward-reassigned", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status },
492
+ );
493
+ if (sent) reassignedIds.push(fresh.id);
494
+ }
495
+ patchWorkspaceMetadata(fresh.id, { strandedAt: undefined, escalatedAt: undefined, strandedNotifiedSteward: steward });
496
+ continue;
497
+ }
498
+
499
+ // Stranded: no online steward could be elected.
500
+ if (strandedAt === undefined) { patchWorkspaceMetadata(fresh.id, { strandedAt: now }); continue; }
501
+ if (now - strandedAt < escalationMs || meta.escalatedAt) continue;
502
+ const sent = notifyTarget(
503
+ fallbackTarget,
504
+ "Stranded workspace needs an owner",
505
+ `Workspace \`${fresh.branch ?? fresh.id}\` in ${fresh.repoRoot} is ${fresh.status} with no online steward (all repo agents offline) for ${Math.round((now - strandedAt) / (60 * 60 * 1000))}h. Please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"} or clean up the worktree.`,
506
+ { kind: "workspace.stranded-escalation", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status, strandedAt },
507
+ );
508
+ if (sent) escalationTargets.push(sent);
509
+ patchWorkspaceMetadata(fresh.id, { escalatedAt: now });
510
+ escalatedIds.push(fresh.id);
511
+ createActivityEvent({
512
+ clientId: `workspace-gc-escalate-${fresh.id}-${now}`,
513
+ kind: "state",
514
+ title: "Workspace escalated",
515
+ body: `${fresh.branch ?? fresh.id} in ${fresh.repoRoot} — stranded ${fresh.status} escalated${fallbackTarget ? ` to ${fallbackTarget}` : " (no fallback configured)"}`,
516
+ meta: fresh.branch ?? fresh.id,
517
+ icon: "ti-alert-octagon",
518
+ view: "orchestrators",
519
+ metadata: { source: "server", maintenanceJobId: "workspace-gc", workspaceId: fresh.id, fallback: fallbackTarget || null },
520
+ });
521
+ }
522
+
523
+ // 3. Auto-abandon stale review_requested worktrees
432
524
  const abandonedIds: string[] = [];
433
525
  const notifiedStewards: string[] = [];
434
526
  for (const ws of all) {
435
527
  if (ws.status === "review_requested" && ws.updatedAt < reviewCutoff) {
436
528
  updateWorkspaceStatus(ws.id, "abandoned", { autoAbandoned: true, abandonedReason: "review_requested TTL exceeded", abandonedAt: now });
437
529
  abandonedIds.push(ws.id);
438
- if (ws.stewardAgentId) {
439
- try {
440
- const msg = sendMessage({
441
- from: "system",
442
- to: ws.stewardAgentId,
443
- kind: "system",
444
- subject: "Workspace auto-abandoned",
445
- body: `Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} was auto-abandoned after ${Math.round(WORKSPACE_REVIEW_TTL_MS / DAY_MS)}d without steward action. Run workspace cleanup to reclaim the worktree.`,
446
- payload: { kind: "workspace.auto-abandoned", workspaceId: ws.id, repoRoot: ws.repoRoot, branch: ws.branch },
447
- });
448
- emitNewMessage(msg);
449
- notifiedStewards.push(ws.stewardAgentId);
450
- } catch {
451
- // Steward gone — activity event is enough.
452
- }
453
- }
530
+ // Notify the steward if one exists, else the configured fallback so a
531
+ // stranded abandon isn't silent (issue #157).
532
+ const target = ws.stewardAgentId ?? fallbackTarget;
533
+ const sent = notifyTarget(
534
+ target,
535
+ "Workspace auto-abandoned",
536
+ `Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} was auto-abandoned after ${Math.round(WORKSPACE_REVIEW_TTL_MS / DAY_MS)}d without steward action. Run workspace cleanup to reclaim the worktree.`,
537
+ { kind: "workspace.auto-abandoned", workspaceId: ws.id, repoRoot: ws.repoRoot, branch: ws.branch },
538
+ );
539
+ if (sent) notifiedStewards.push(sent);
454
540
  createActivityEvent({
455
541
  clientId: `workspace-gc-abandon-${ws.id}-${now}`,
456
542
  kind: "state",
@@ -483,7 +569,16 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
483
569
  pruneCommands.push(command.id);
484
570
  }
485
571
 
486
- return { prunedTerminal: terminalIds, autoAbandoned: abandonedIds, notifiedStewards, pruneCommands };
572
+ return {
573
+ prunedTerminal: terminalIds,
574
+ autoAbandoned: abandonedIds,
575
+ notifiedStewards,
576
+ pruneCommands,
577
+ releasedLeaseRepos,
578
+ escalated: escalatedIds,
579
+ reassigned: reassignedIds,
580
+ escalationTargets,
581
+ };
487
582
  }
488
583
 
489
584
  let timer: Timer | null = null;
package/src/routes.ts CHANGED
@@ -85,6 +85,11 @@ import {
85
85
  getWorkspace,
86
86
  listWorkspaces,
87
87
  updateWorkspaceStatus,
88
+ acquireMergeLease,
89
+ setMergeLeaseCommand,
90
+ releaseMergeLease,
91
+ listRepoStewards,
92
+ listMergeLeases,
88
93
  deleteWorkspace,
89
94
  deleteOrchestrator,
90
95
  evaluatePoolBindings,
@@ -3799,6 +3804,10 @@ const getWorkspaceById: Handler = (_req, params) => {
3799
3804
  return json(workspace);
3800
3805
  };
3801
3806
 
3807
+ // Per-repo coordination state: persistent steward records (survive offline gaps)
3808
+ // and in-flight merge serialization leases (issue #157).
3809
+ const getWorkspaceStewards: Handler = () => json({ stewards: listRepoStewards(), mergeLeases: listMergeLeases() });
3810
+
3802
3811
  // Proxy a read-only workspace interrogation to the owning orchestrator's host
3803
3812
  // API. Degrades to { available: false } rather than erroring so the dashboard
3804
3813
  // can render a placeholder when the host is offline or there's no worktree.
@@ -3931,6 +3940,7 @@ const postWorkspaceOrphanReclaim: Handler = async (req) => {
3931
3940
  const postWorkspaceAction: Handler = async (req, params) => {
3932
3941
  const parsed = await parseBody<unknown>(req);
3933
3942
  if (!parsed.ok) return error(parsed.error, parsed.status);
3943
+ let mergeLeaseRepo: string | undefined;
3934
3944
  try {
3935
3945
  if (!isRecord(parsed.body)) return error("body required");
3936
3946
  const workspace = getWorkspace(params.id!);
@@ -3949,6 +3959,18 @@ const postWorkspaceAction: Handler = async (req, params) => {
3949
3959
  const denied = authorizeRoute(req, { scope: requiresCommand ? "command:write" : "agent:write", resource: { agentId, cwd: workspace.worktreePath } });
3950
3960
  if (denied) return denied;
3951
3961
  if (action === "status") return json(workspace);
3962
+ // Serialize base merges per repo: acquire the merge lease BEFORE mutating
3963
+ // status so a losing request leaves the workspace untouched (issue #157).
3964
+ if (action === "merge") {
3965
+ const lease = acquireMergeLease(workspace.repoRoot, workspace.id, agentId ?? "dashboard");
3966
+ if (!lease.ok) {
3967
+ return error(
3968
+ `a merge is already in progress for ${workspace.repoRoot} (workspace ${lease.lease.workspaceId}); retry after it settles`,
3969
+ 409,
3970
+ );
3971
+ }
3972
+ mergeLeaseRepo = workspace.repoRoot;
3973
+ }
3952
3974
  const statusByAction: Record<string, WorkspaceStatus | undefined> = {
3953
3975
  status: undefined,
3954
3976
  ready: "ready",
@@ -3982,7 +4004,11 @@ const postWorkspaceAction: Handler = async (req, params) => {
3982
4004
  };
3983
4005
  if (action === "merge") {
3984
4006
  // Merge needs a live host: rebasing against a stale base later is unsafe.
3985
- if (!onlineOwner) return error("no online orchestrator available for workspace merge", 409);
4007
+ if (!onlineOwner) {
4008
+ releaseMergeLease({ repoRoot: workspace.repoRoot, workspaceId: workspace.id });
4009
+ mergeLeaseRepo = undefined;
4010
+ return error("no online orchestrator available for workspace merge", 409);
4011
+ }
3986
4012
  const strategy = cleanEnum(parsed.body.strategy, "strategy", ["pr", "rebase-ff", "auto"] as const, "auto");
3987
4013
  command = createCommand({
3988
4014
  type: "workspace.merge",
@@ -4014,6 +4040,9 @@ const postWorkspaceAction: Handler = async (req, params) => {
4014
4040
  params: { action: "cleanup", ...baseParams, deleteBranch: true, queued: owner.status !== "online" },
4015
4041
  });
4016
4042
  }
4043
+ // Bind the lease to the dispatched merge command so it's released by id
4044
+ // when the command settles (postCommandResult).
4045
+ if (action === "merge" && mergeLeaseRepo) setMergeLeaseCommand(mergeLeaseRepo, command.id);
4017
4046
  emitCommand(command);
4018
4047
  }
4019
4048
  auditEvent({
@@ -4029,6 +4058,9 @@ const postWorkspaceAction: Handler = async (req, params) => {
4029
4058
  });
4030
4059
  return json({ workspace: updated, command }, requiresCommand ? 202 : 200);
4031
4060
  } catch (e) {
4061
+ // A merge that acquired the lease but failed before dispatch must release it,
4062
+ // or the repo stays blocked until the TTL expires.
4063
+ if (mergeLeaseRepo) releaseMergeLease({ repoRoot: mergeLeaseRepo });
4032
4064
  if (e instanceof ValidationError) return error(e.message, 400);
4033
4065
  throw e;
4034
4066
  }
@@ -4336,6 +4368,11 @@ const patchCommand: Handler = async (req, params) => {
4336
4368
  }
4337
4369
  }
4338
4370
  if (command.type === "workspace.merge") {
4371
+ // Merge settled (either way) — free the per-repo merge lease so the next
4372
+ // base merge can proceed (issue #157).
4373
+ if (command.status === "succeeded" || command.status === "failed") {
4374
+ releaseMergeLease({ commandId: command.id });
4375
+ }
4339
4376
  if (command.status === "succeeded" && isRecord(command.result)) {
4340
4377
  const workspaceId = cleanString(command.result.workspaceId, "result.workspaceId", { max: 160 });
4341
4378
  const resultStatus = cleanEnum(command.result.status, "result.status", VALID_WORKSPACE_STATUSES) as WorkspaceStatus | undefined;
@@ -6259,6 +6296,7 @@ const routes: Route[] = [
6259
6296
  // Static segments before :id so "/workspaces/orphans" isn't captured as an id.
6260
6297
  route("GET", "/api/workspaces/orphans", getWorkspaceOrphans),
6261
6298
  route("POST", "/api/workspaces/orphans/reclaim", postWorkspaceOrphanReclaim),
6299
+ route("GET", "/api/workspaces/stewards", getWorkspaceStewards),
6262
6300
  route("GET", "/api/workspaces/:id", getWorkspaceById),
6263
6301
  route("GET", "/api/workspaces/:id/git-state", getWorkspaceGitState),
6264
6302
  route("GET", "/api/workspaces/:id/merge-preview", getWorkspaceMergePreview),