agent-relay-server 0.11.6 → 0.11.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/openapi.json +30 -1
- package/package.json +6 -4
- package/src/config.ts +4 -0
- package/src/db.ts +224 -21
- package/src/maintenance.ts +113 -18
- package/src/routes.ts +39 -1
package/docs/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "Agent Relay API",
|
|
5
|
-
"version": "0.11.
|
|
5
|
+
"version": "0.11.4",
|
|
6
6
|
"description": "Real-time message bus for inter-agent communication. Agent-first: this spec is designed for machine consumption — agents can self-discover the full API surface via GET /api/spec.",
|
|
7
7
|
"license": {
|
|
8
8
|
"name": "MIT",
|
|
@@ -3682,6 +3682,35 @@
|
|
|
3682
3682
|
]
|
|
3683
3683
|
}
|
|
3684
3684
|
},
|
|
3685
|
+
"/api/workspaces/stewards": {
|
|
3686
|
+
"get": {
|
|
3687
|
+
"operationId": "getWorkspaceStewards",
|
|
3688
|
+
"summary": "List repo stewards and merge leases",
|
|
3689
|
+
"tags": [
|
|
3690
|
+
"Other"
|
|
3691
|
+
],
|
|
3692
|
+
"description": "Per-repo coordination state for isolated workspaces. Returns `{ stewards, mergeLeases }`. Each steward record (`repoRoot`, `stewardAgentId`, `lastStewardAgentId`, `electedAt`) is persistent — it survives a full all-agents-offline gap (steward goes null/dormant, last-known preserved) and is re-elected when an agent rejoins the repo. Each merge lease (`repoRoot`, `workspaceId`, `commandId`, `holder`, `expiresAt`) marks an in-flight base merge; only one may be held per repo so concurrent merges into base are serialized.",
|
|
3693
|
+
"responses": {
|
|
3694
|
+
"200": {
|
|
3695
|
+
"description": "Success",
|
|
3696
|
+
"content": {
|
|
3697
|
+
"application/json": {}
|
|
3698
|
+
}
|
|
3699
|
+
}
|
|
3700
|
+
},
|
|
3701
|
+
"security": [
|
|
3702
|
+
{
|
|
3703
|
+
"bearerAuth": []
|
|
3704
|
+
},
|
|
3705
|
+
{
|
|
3706
|
+
"tokenHeader": []
|
|
3707
|
+
},
|
|
3708
|
+
{
|
|
3709
|
+
"tokenQuery": []
|
|
3710
|
+
}
|
|
3711
|
+
]
|
|
3712
|
+
}
|
|
3713
|
+
},
|
|
3685
3714
|
"/api/workspaces/{id}": {
|
|
3686
3715
|
"get": {
|
|
3687
3716
|
"operationId": "getWorkspaceById",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "agent-relay-server",
|
|
3
|
-
"version": "0.11.6",
|
|
3
|
+
"version": "0.11.8",
|
|
4
4
|
"description": "Lightweight HTTP message relay for inter-agent communication across machines",
|
|
5
5
|
"module": "src/index.ts",
|
|
6
6
|
"type": "module",
|
|
@@ -39,14 +39,16 @@
|
|
|
39
39
|
"postinstall": "node scripts/install-bin-shim.cjs",
|
|
40
40
|
"start": "bun run src/index.ts",
|
|
41
41
|
"dev": "bun --watch run src/index.ts",
|
|
42
|
-
"dev:dashboard": "cd dashboard &&
|
|
42
|
+
"dev:dashboard": "cd dashboard && bun run dev",
|
|
43
43
|
"build:sdk": "cd sdk && bun run build",
|
|
44
|
-
"build:dashboard": "bun run build:sdk && cd dashboard &&
|
|
44
|
+
"build:dashboard": "bun run build:sdk && cd dashboard && bun run build",
|
|
45
45
|
"test": "bun test",
|
|
46
46
|
"smoke:spawn": "bun run scripts/orchestrator-spawn-smoke.ts",
|
|
47
47
|
"typecheck": "tsc --noEmit",
|
|
48
48
|
"docs:api": "bun run scripts/extract-api-spec.ts",
|
|
49
|
-
"docs:api:check": "bun run scripts/extract-api-spec.ts --check"
|
|
49
|
+
"docs:api:check": "bun run scripts/extract-api-spec.ts --check",
|
|
50
|
+
"release": "bun run scripts/release.ts",
|
|
51
|
+
"publish:ci": "bun run scripts/publish-ci.ts"
|
|
50
52
|
},
|
|
51
53
|
"keywords": [
|
|
52
54
|
"agent-relay",
|
package/src/config.ts
CHANGED
|
@@ -24,6 +24,10 @@ export const OFFLINE_PRUNE_MS = envPositiveInt("OFFLINE_PRUNE_MS", DAY_MS); // 2
|
|
|
24
24
|
export const REAP_INTERVAL_MS = envPositiveInt("REAP_INTERVAL_MS", 60_000); // reaper cadence
|
|
25
25
|
export const CLAIM_LEASE_MS = envPositiveInt("AGENT_RELAY_CLAIM_LEASE_MS", 1_800_000); // 30min claim lease
|
|
26
26
|
export const POOL_CLAIM_LEASE_MS = envPositiveInt("AGENT_RELAY_POOL_CLAIM_LEASE_MS", STALE_TTL_MS * 3); // pool binding lease
|
|
27
|
+
// Per-repo merge serialization lease — only one base merge may run at a time per
|
|
28
|
+
// repo. Held from when a workspace.merge command is dispatched until it settles
|
|
29
|
+
// (or this TTL expires, in case the orchestrator never reports back).
|
|
30
|
+
export const WORKSPACE_MERGE_LEASE_MS = envPositiveInt("AGENT_RELAY_WORKSPACE_MERGE_LEASE_MS", 900_000); // 15min
|
|
27
31
|
|
|
28
32
|
// Max body size for any POST/PATCH request (64 KiB).
|
|
29
33
|
export const MAX_BODY_BYTES = 64 * 1024;
|
package/src/db.ts
CHANGED
|
@@ -71,7 +71,7 @@ import type {
|
|
|
71
71
|
WorkspaceRecord,
|
|
72
72
|
WorkspaceStatus,
|
|
73
73
|
} from "./types";
|
|
74
|
-
import { STALE_TTL_MS, DAY_MS, CLAIM_LEASE_MS, POOL_CLAIM_LEASE_MS } from "./config";
|
|
74
|
+
import { STALE_TTL_MS, DAY_MS, CLAIM_LEASE_MS, POOL_CLAIM_LEASE_MS, WORKSPACE_MERGE_LEASE_MS } from "./config";
|
|
75
75
|
|
|
76
76
|
let db: Database;
|
|
77
77
|
const CONTEXT_SNAPSHOT_DEBOUNCE_MS = 60_000;
|
|
@@ -379,6 +379,31 @@ export function initDb(path: string = "agent-relay.db"): Database {
|
|
|
379
379
|
CREATE INDEX IF NOT EXISTS idx_workspaces_owner_agent ON workspaces(owner_agent_id);
|
|
380
380
|
CREATE INDEX IF NOT EXISTS idx_workspaces_policy ON workspaces(owner_policy_name);
|
|
381
381
|
|
|
382
|
+
-- Persistent per-repo steward record. Keyed to the repo, not a live agent, so
|
|
383
|
+
-- it survives a full all-agents-offline gap: steward_agent_id goes NULL
|
|
384
|
+
-- (dormant) while last_steward_agent_id preserves continuity, and the row is
|
|
385
|
+
-- re-filled when an agent rejoins the repo. This is the durable backing store
|
|
386
|
+
-- the steward column on workspace rows mirrors for display/maintenance.
|
|
387
|
+
CREATE TABLE IF NOT EXISTS repo_stewards (
|
|
388
|
+
repo_root TEXT PRIMARY KEY,
|
|
389
|
+
steward_agent_id TEXT,
|
|
390
|
+
last_steward_agent_id TEXT,
|
|
391
|
+
elected_at INTEGER,
|
|
392
|
+
updated_at INTEGER NOT NULL
|
|
393
|
+
);
|
|
394
|
+
|
|
395
|
+
-- Per-repo merge serialization lease. Exactly one base merge may be in flight
|
|
396
|
+
-- per repo; a second merge request is rejected until the holder settles or the
|
|
397
|
+
-- lease expires. Atomicity comes from the repo_root PRIMARY KEY + expiry guard.
|
|
398
|
+
CREATE TABLE IF NOT EXISTS workspace_merge_leases (
|
|
399
|
+
repo_root TEXT PRIMARY KEY,
|
|
400
|
+
workspace_id TEXT NOT NULL,
|
|
401
|
+
command_id TEXT,
|
|
402
|
+
holder TEXT,
|
|
403
|
+
acquired_at INTEGER NOT NULL,
|
|
404
|
+
expires_at INTEGER NOT NULL
|
|
405
|
+
);
|
|
406
|
+
|
|
382
407
|
CREATE TABLE IF NOT EXISTS tasks (
|
|
383
408
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
384
409
|
source TEXT NOT NULL,
|
|
@@ -1692,6 +1717,9 @@ export function upsertAgent(input: RegisterAgentInput): AgentCard {
|
|
|
1692
1717
|
const agent = getAgent(input.id)!;
|
|
1693
1718
|
if (agent.kind === "channel") upsertChannelForAgent(agent);
|
|
1694
1719
|
evaluatePoolBindings();
|
|
1720
|
+
// A (re)joining agent may revive a dormant repo steward — re-elect for the
|
|
1721
|
+
// repos it owns live workspaces in (issue #157, steward survives offline gap).
|
|
1722
|
+
if (agent.status !== "offline") electWorkspaceStewardsForAgent(agent.id);
|
|
1695
1723
|
return agent;
|
|
1696
1724
|
}
|
|
1697
1725
|
|
|
@@ -4140,13 +4168,16 @@ function relayConversationId(message: Message): string | undefined {
|
|
|
4140
4168
|
}
|
|
4141
4169
|
|
|
4142
4170
|
function isCoveredByLaterAgentResponse(message: Message, agentId: string): boolean {
|
|
4171
|
+
// Order by id, not created_at: ids are monotonic insertion order, so this is
|
|
4172
|
+
// robust when a reply lands in the same millisecond as the message it covers
|
|
4173
|
+
// (created_at > … strictly would miss it, leaving the message wrongly pending).
|
|
4143
4174
|
const replies = (db.prepare(`
|
|
4144
4175
|
${MSG_SELECT}
|
|
4145
4176
|
WHERE m.from_agent = ?
|
|
4146
|
-
AND m.created_at > ?
|
|
4147
|
-
ORDER BY m.created_at ASC
|
|
4177
|
+
AND m.id > ?
|
|
4178
|
+
ORDER BY m.id ASC
|
|
4148
4179
|
LIMIT 200
|
|
4149
|
-
`).all(agentId, message.
|
|
4180
|
+
`).all(agentId, message.id) as any[]).map(rowToMessage);
|
|
4150
4181
|
|
|
4151
4182
|
const conversationId = relayConversationId(message);
|
|
4152
4183
|
return replies.some((reply) => {
|
|
@@ -5108,31 +5139,203 @@ export function updateWorkspaceStatus(id: string, status: WorkspaceStatus, metad
|
|
|
5108
5139
|
return getWorkspace(id);
|
|
5109
5140
|
}
|
|
5110
5141
|
|
|
5142
|
+
// Workspace statuses that count as "live" for stewardship — an agent owning one
|
|
5143
|
+
// of these is a candidate steward; the repo is worth coordinating.
|
|
5144
|
+
const STEWARD_LIVE_STATUSES = "'active', 'ready', 'conflict', 'review_requested', 'merge_planned'";
|
|
5145
|
+
|
|
5146
|
+
export interface RepoStewardRecord {
|
|
5147
|
+
repoRoot: string;
|
|
5148
|
+
stewardAgentId?: string;
|
|
5149
|
+
lastStewardAgentId?: string;
|
|
5150
|
+
electedAt?: number;
|
|
5151
|
+
updatedAt: number;
|
|
5152
|
+
}
|
|
5153
|
+
|
|
5154
|
+
function rowToRepoSteward(row: any): RepoStewardRecord {
|
|
5155
|
+
return {
|
|
5156
|
+
repoRoot: row.repo_root,
|
|
5157
|
+
stewardAgentId: row.steward_agent_id ?? undefined,
|
|
5158
|
+
lastStewardAgentId: row.last_steward_agent_id ?? undefined,
|
|
5159
|
+
electedAt: row.elected_at ?? undefined,
|
|
5160
|
+
updatedAt: row.updated_at,
|
|
5161
|
+
};
|
|
5162
|
+
}
|
|
5163
|
+
|
|
5164
|
+
export function getRepoSteward(repoRoot: string): RepoStewardRecord | null {
|
|
5165
|
+
const row = db.prepare("SELECT * FROM repo_stewards WHERE repo_root = ?").get(repoRoot) as any;
|
|
5166
|
+
return row ? rowToRepoSteward(row) : null;
|
|
5167
|
+
}
|
|
5168
|
+
|
|
5169
|
+
export function listRepoStewards(): RepoStewardRecord[] {
|
|
5170
|
+
return (db.prepare("SELECT * FROM repo_stewards ORDER BY updated_at DESC").all() as any[]).map(rowToRepoSteward);
|
|
5171
|
+
}
|
|
5172
|
+
|
|
5173
|
+
// Persist the elected steward for a repo. The row is never deleted, so a repo's
|
|
5174
|
+
// stewardship survives a full all-agents-offline gap (steward goes NULL/dormant,
|
|
5175
|
+
// last_steward_agent_id keeps continuity) and resumes on the next agent join.
|
|
5176
|
+
function upsertRepoSteward(repoRoot: string, steward: string | null, now: number): void {
|
|
5177
|
+
db.prepare(`
|
|
5178
|
+
INSERT INTO repo_stewards (repo_root, steward_agent_id, last_steward_agent_id, elected_at, updated_at)
|
|
5179
|
+
VALUES ($repoRoot, $steward, $steward, $electedAt, $now)
|
|
5180
|
+
ON CONFLICT(repo_root) DO UPDATE SET
|
|
5181
|
+
steward_agent_id = $steward,
|
|
5182
|
+
last_steward_agent_id = coalesce($steward, repo_stewards.last_steward_agent_id),
|
|
5183
|
+
elected_at = CASE
|
|
5184
|
+
WHEN $steward IS NOT NULL AND $steward IS NOT repo_stewards.steward_agent_id THEN $now
|
|
5185
|
+
ELSE repo_stewards.elected_at
|
|
5186
|
+
END,
|
|
5187
|
+
updated_at = $now
|
|
5188
|
+
`).run({ $repoRoot: repoRoot, $steward: steward, $electedAt: steward ? now : null, $now: now });
|
|
5189
|
+
}
|
|
5190
|
+
|
|
5111
5191
|
function electWorkspaceStewards(repoRoot?: string): void {
|
|
5112
5192
|
const params: string[] = repoRoot ? [repoRoot] : [];
|
|
5113
5193
|
const repoRows = db.prepare(`
|
|
5114
5194
|
SELECT DISTINCT repo_root FROM workspaces
|
|
5115
|
-
WHERE status IN (
|
|
5195
|
+
WHERE status IN (${STEWARD_LIVE_STATUSES})
|
|
5116
5196
|
${repoRoot ? "AND repo_root = ?" : ""}
|
|
5117
5197
|
`).all(...params) as Array<{ repo_root: string }>;
|
|
5198
|
+
const now = Date.now();
|
|
5118
5199
|
for (const row of repoRows) {
|
|
5119
|
-
|
|
5120
|
-
|
|
5121
|
-
|
|
5122
|
-
|
|
5123
|
-
|
|
5124
|
-
|
|
5125
|
-
|
|
5126
|
-
|
|
5127
|
-
|
|
5128
|
-
|
|
5129
|
-
|
|
5130
|
-
|
|
5131
|
-
|
|
5132
|
-
|
|
5133
|
-
|
|
5134
|
-
|
|
5200
|
+
// Candidate pool: owners of live workspaces in this repo who are online,
|
|
5201
|
+
// oldest first. A steward must be an online agent actively in the repo — an
|
|
5202
|
+
// offline agent can't coordinate, so it is never elected (the old bug).
|
|
5203
|
+
const pool = (db.prepare(`
|
|
5204
|
+
SELECT w.owner_agent_id AS id, MIN(w.created_at) AS created_at
|
|
5205
|
+
FROM workspaces w JOIN agents a ON a.id = w.owner_agent_id
|
|
5206
|
+
WHERE w.repo_root = ? AND w.owner_agent_id IS NOT NULL
|
|
5207
|
+
AND a.status != 'offline' AND w.status IN (${STEWARD_LIVE_STATUSES})
|
|
5208
|
+
GROUP BY w.owner_agent_id
|
|
5209
|
+
ORDER BY created_at ASC
|
|
5210
|
+
`).all(row.repo_root) as Array<{ id: string }>).map((r) => r.id);
|
|
5211
|
+
|
|
5212
|
+
// Keep the current steward if it is still in the pool (stable election);
|
|
5213
|
+
// otherwise promote the oldest online owner, else go dormant (null).
|
|
5214
|
+
const current = getRepoSteward(row.repo_root)?.stewardAgentId ?? null;
|
|
5215
|
+
const steward = (current && pool.includes(current) ? current : pool[0]) ?? null;
|
|
5216
|
+
|
|
5217
|
+
upsertRepoSteward(row.repo_root, steward, now);
|
|
5218
|
+
// Mirror onto live workspace rows only when the steward actually changed, so
|
|
5219
|
+
// re-elections don't churn updated_at and reset the auto-abandon clock for a
|
|
5220
|
+
// dormant repo (a stranded review_requested must still age out).
|
|
5221
|
+
if (steward !== current) {
|
|
5222
|
+
db.prepare(`UPDATE workspaces SET steward_agent_id = ?, updated_at = ? WHERE repo_root = ? AND status IN (${STEWARD_LIVE_STATUSES})`)
|
|
5223
|
+
.run(steward, now, row.repo_root);
|
|
5224
|
+
}
|
|
5225
|
+
}
|
|
5226
|
+
}
|
|
5227
|
+
|
|
5228
|
+
// Public re-election trigger that does not change any workspace status — used by
|
|
5229
|
+
// maintenance to revive a dormant steward (e.g. on the next agent join) before
|
|
5230
|
+
// deciding whether a stranded worktree needs escalation.
|
|
5231
|
+
export function reelectRepoSteward(repoRoot: string): void {
|
|
5232
|
+
electWorkspaceStewards(repoRoot);
|
|
5233
|
+
}
|
|
5234
|
+
|
|
5235
|
+
// Merge a metadata patch into a workspace WITHOUT bumping updated_at or running a
|
|
5236
|
+
// steward election. For maintenance bookkeeping (stranded/escalation markers)
|
|
5237
|
+
// that must not disturb age-based GC timers. undefined values delete keys.
|
|
5238
|
+
export function patchWorkspaceMetadata(id: string, patch: Record<string, unknown>): WorkspaceRecord | null {
|
|
5239
|
+
const existing = getWorkspace(id);
|
|
5240
|
+
if (!existing) return null;
|
|
5241
|
+
const next = { ...existing.metadata };
|
|
5242
|
+
for (const [k, v] of Object.entries(patch)) {
|
|
5243
|
+
if (v === undefined) delete next[k];
|
|
5244
|
+
else next[k] = v;
|
|
5135
5245
|
}
|
|
5246
|
+
db.prepare("UPDATE workspaces SET metadata = ? WHERE id = ?").run(JSON.stringify(next), id);
|
|
5247
|
+
return getWorkspace(id);
|
|
5248
|
+
}
|
|
5249
|
+
|
|
5250
|
+
// Re-elect stewards for every repo where an agent owns a live workspace. Called
|
|
5251
|
+
// when an agent (re)registers so a dormant repo regains a steward on rejoin
|
|
5252
|
+
// without a full unscoped sweep.
|
|
5253
|
+
function electWorkspaceStewardsForAgent(agentId: string): void {
|
|
5254
|
+
const repos = db.prepare(`
|
|
5255
|
+
SELECT DISTINCT repo_root FROM workspaces
|
|
5256
|
+
WHERE owner_agent_id = ? AND status IN (${STEWARD_LIVE_STATUSES})
|
|
5257
|
+
`).all(agentId) as Array<{ repo_root: string }>;
|
|
5258
|
+
for (const r of repos) electWorkspaceStewards(r.repo_root);
|
|
5259
|
+
}
|
|
5260
|
+
|
|
5261
|
+
// --- Per-repo merge serialization lease (issue #157) -----------------------
|
|
5262
|
+
|
|
5263
|
+
export interface MergeLeaseRecord {
|
|
5264
|
+
repoRoot: string;
|
|
5265
|
+
workspaceId: string;
|
|
5266
|
+
commandId?: string;
|
|
5267
|
+
holder?: string;
|
|
5268
|
+
acquiredAt: number;
|
|
5269
|
+
expiresAt: number;
|
|
5270
|
+
}
|
|
5271
|
+
|
|
5272
|
+
function rowToMergeLease(row: any): MergeLeaseRecord {
|
|
5273
|
+
return {
|
|
5274
|
+
repoRoot: row.repo_root,
|
|
5275
|
+
workspaceId: row.workspace_id,
|
|
5276
|
+
commandId: row.command_id ?? undefined,
|
|
5277
|
+
holder: row.holder ?? undefined,
|
|
5278
|
+
acquiredAt: row.acquired_at,
|
|
5279
|
+
expiresAt: row.expires_at,
|
|
5280
|
+
};
|
|
5281
|
+
}
|
|
5282
|
+
|
|
5283
|
+
export function getMergeLease(repoRoot: string): MergeLeaseRecord | null {
|
|
5284
|
+
const row = db.prepare("SELECT * FROM workspace_merge_leases WHERE repo_root = ?").get(repoRoot) as any;
|
|
5285
|
+
return row ? rowToMergeLease(row) : null;
|
|
5286
|
+
}
|
|
5287
|
+
|
|
5288
|
+
export function listMergeLeases(): MergeLeaseRecord[] {
|
|
5289
|
+
return (db.prepare("SELECT * FROM workspace_merge_leases ORDER BY acquired_at DESC").all() as any[]).map(rowToMergeLease);
|
|
5290
|
+
}
|
|
5291
|
+
|
|
5292
|
+
export function releaseExpiredMergeLeases(now: number = Date.now()): string[] {
|
|
5293
|
+
const expired = db.prepare("SELECT repo_root FROM workspace_merge_leases WHERE expires_at <= ?").all(now) as Array<{ repo_root: string }>;
|
|
5294
|
+
if (!expired.length) return [];
|
|
5295
|
+
db.prepare("DELETE FROM workspace_merge_leases WHERE expires_at <= ?").run(now);
|
|
5296
|
+
return expired.map((r) => r.repo_root);
|
|
5297
|
+
}
|
|
5298
|
+
|
|
5299
|
+
// Atomically acquire the per-repo merge lease. Succeeds if no live lease is held
|
|
5300
|
+
// for the repo (or the existing one has expired). Serialized via db.transaction
|
|
5301
|
+
// so two concurrent merge requests for the same repo can't both win.
|
|
5302
|
+
export function acquireMergeLease(
|
|
5303
|
+
repoRoot: string,
|
|
5304
|
+
workspaceId: string,
|
|
5305
|
+
holder?: string,
|
|
5306
|
+
): { ok: true; lease: MergeLeaseRecord } | { ok: false; lease: MergeLeaseRecord } {
|
|
5307
|
+
return db.transaction(() => {
|
|
5308
|
+
const now = Date.now();
|
|
5309
|
+
const existing = getMergeLease(repoRoot);
|
|
5310
|
+
if (existing && existing.expiresAt > now) return { ok: false as const, lease: existing };
|
|
5311
|
+
const expiresAt = now + WORKSPACE_MERGE_LEASE_MS;
|
|
5312
|
+
db.prepare(`
|
|
5313
|
+
INSERT INTO workspace_merge_leases (repo_root, workspace_id, command_id, holder, acquired_at, expires_at)
|
|
5314
|
+
VALUES (?, ?, NULL, ?, ?, ?)
|
|
5315
|
+
ON CONFLICT(repo_root) DO UPDATE SET
|
|
5316
|
+
workspace_id = excluded.workspace_id, command_id = NULL, holder = excluded.holder,
|
|
5317
|
+
acquired_at = excluded.acquired_at, expires_at = excluded.expires_at
|
|
5318
|
+
`).run(repoRoot, workspaceId, holder ?? null, now, expiresAt);
|
|
5319
|
+
return { ok: true as const, lease: getMergeLease(repoRoot)! };
|
|
5320
|
+
})();
|
|
5321
|
+
}
|
|
5322
|
+
|
|
5323
|
+
// Attach the dispatched command id to a held lease so it can be released by
|
|
5324
|
+
// command id when the merge settles.
|
|
5325
|
+
export function setMergeLeaseCommand(repoRoot: string, commandId: string): void {
|
|
5326
|
+
db.prepare("UPDATE workspace_merge_leases SET command_id = ? WHERE repo_root = ?").run(commandId, repoRoot);
|
|
5327
|
+
}
|
|
5328
|
+
|
|
5329
|
+
// Release a merge lease. Guard by commandId/workspaceId when known so a stale
|
|
5330
|
+
// release can't drop a newer lease for the same repo.
|
|
5331
|
+
export function releaseMergeLease(opts: { repoRoot?: string; commandId?: string; workspaceId?: string }): boolean {
|
|
5332
|
+
const where: string[] = [];
|
|
5333
|
+
const params: string[] = [];
|
|
5334
|
+
if (opts.repoRoot) { where.push("repo_root = ?"); params.push(opts.repoRoot); }
|
|
5335
|
+
if (opts.commandId) { where.push("command_id = ?"); params.push(opts.commandId); }
|
|
5336
|
+
if (opts.workspaceId) { where.push("workspace_id = ?"); params.push(opts.workspaceId); }
|
|
5337
|
+
if (!where.length) return false;
|
|
5338
|
+
return db.prepare(`DELETE FROM workspace_merge_leases WHERE ${where.join(" AND ")}`).run(...params).changes > 0;
|
|
5136
5339
|
}
|
|
5137
5340
|
|
|
5138
5341
|
export function deleteOrchestrator(id: string): boolean {
|
package/src/maintenance.ts
CHANGED
|
@@ -9,16 +9,22 @@ import {
|
|
|
9
9
|
createActivityEvent,
|
|
10
10
|
evaluatePoolBindings,
|
|
11
11
|
expireQueuedMessages,
|
|
12
|
+
getAgent,
|
|
12
13
|
getDb,
|
|
14
|
+
getRepoSteward,
|
|
15
|
+
getWorkspace,
|
|
13
16
|
listOrchestrators,
|
|
14
17
|
listWorkspaces,
|
|
18
|
+
patchWorkspaceMetadata,
|
|
15
19
|
pruneOfflineAgents,
|
|
16
20
|
pruneOldMessages,
|
|
17
21
|
deleteWorkspace,
|
|
18
22
|
pruneOrphanedSharedWorkspaces,
|
|
19
23
|
reapStaleAgents,
|
|
20
24
|
reapStaleOrchestrators,
|
|
25
|
+
reelectRepoSteward,
|
|
21
26
|
releaseExpiredClaims,
|
|
27
|
+
releaseExpiredMergeLeases,
|
|
22
28
|
releaseOrphanedTasks,
|
|
23
29
|
sendMessage,
|
|
24
30
|
sweepArtifacts,
|
|
@@ -49,6 +55,15 @@ const CONFLICT_SCAN_INTERVAL_MS = Number(process.env.AGENT_RELAY_CONFLICT_SCAN_I
|
|
|
49
55
|
const WORKSPACE_RETENTION_MS = Number(process.env.AGENT_RELAY_WORKSPACE_RETENTION_MS) || DAY_MS;
|
|
50
56
|
const WORKSPACE_REVIEW_TTL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_REVIEW_TTL_MS) || 3 * DAY_MS;
|
|
51
57
|
const WORKSPACE_GC_INTERVAL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_GC_INTERVAL_MS) || 60 * 60 * 1000;
|
|
58
|
+
// How long a stranded review_requested/conflict worktree (no online steward) may
|
|
59
|
+
// sit before escalating to the configured fallback target, and the durable
|
|
60
|
+
// escalation target itself (`policy:<name>`, `label:<name>`, `cap:<name>`, an
|
|
61
|
+
// agent id, or `broadcast`). Read at call-time so config changes take effect
|
|
62
|
+
// without a restart (issue #157).
|
|
63
|
+
const stewardEscalationMs = () => Number(process.env.AGENT_RELAY_WORKSPACE_STEWARD_ESCALATION_MS) || 60 * 60 * 1000;
|
|
64
|
+
const stewardFallbackTarget = () => (process.env.AGENT_RELAY_WORKSPACE_STEWARD_FALLBACK || "").trim();
|
|
65
|
+
// Statuses that need an owner — a stranded one of these is what escalation rescues.
|
|
66
|
+
const STRANDABLE_STATUSES = new Set<WorkspaceStatus>(["review_requested", "conflict"]);
|
|
52
67
|
// Live statuses worth scanning. Terminal (cleaned/merged/abandoned) and
|
|
53
68
|
// in-flight (cleanup_requested) states are skipped.
|
|
54
69
|
const CONFLICT_SCAN_STATUSES = new Set<WorkspaceStatus>(["active", "ready", "review_requested", "merge_planned", "conflict"]);
|
|
@@ -413,11 +428,27 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
|
|
|
413
428
|
return { scanned: candidates.length, flagged, cleared, notifiedStewards };
|
|
414
429
|
}
|
|
415
430
|
|
|
431
|
+
// Send a system DM, swallowing failures (a stale/missing/misconfigured target
|
|
432
|
+
// must never break the GC sweep). Returns the target on success, null otherwise.
|
|
433
|
+
function notifyTarget(target: string, subject: string, body: string, payload: Record<string, unknown>): string | null {
|
|
434
|
+
if (!target) return null;
|
|
435
|
+
try {
|
|
436
|
+
emitNewMessage(sendMessage({ from: "system", to: target, kind: "system", subject, body, payload }));
|
|
437
|
+
return target;
|
|
438
|
+
} catch {
|
|
439
|
+
return null;
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
|
|
416
443
|
async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
417
444
|
const now = Date.now();
|
|
418
445
|
const cutoff = now - WORKSPACE_RETENTION_MS;
|
|
419
446
|
const reviewCutoff = now - WORKSPACE_REVIEW_TTL_MS;
|
|
420
447
|
|
|
448
|
+
// 0. Free any merge leases whose holder never reported back (orchestrator died
|
|
449
|
+
// mid-merge). The lease TTL is the safety net; this just reclaims them eagerly.
|
|
450
|
+
const releasedLeaseRepos = releaseExpiredMergeLeases(now);
|
|
451
|
+
|
|
421
452
|
// 1. Prune terminal rows past retention
|
|
422
453
|
const all = listWorkspaces();
|
|
423
454
|
const terminalIds: string[] = [];
|
|
@@ -428,29 +459,84 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
|
428
459
|
}
|
|
429
460
|
}
|
|
430
461
|
|
|
431
|
-
// 2.
|
|
462
|
+
// 2. Rescue stranded review_requested/conflict worktrees (issue #157). A
|
|
463
|
+
// worktree is "stranded" when its steward is gone (all repo agents offline).
|
|
464
|
+
// Re-elect first — an agent may have rejoined — and hand off to the new
|
|
465
|
+
// steward; if none can be elected past the TTL, escalate to the fallback
|
|
466
|
+
// target so it never rots in silence. Bookkeeping uses patchWorkspaceMetadata
|
|
467
|
+
// (no updated_at bump) so the auto-abandon clock below keeps ticking.
|
|
468
|
+
const escalatedIds: string[] = [];
|
|
469
|
+
const reassignedIds: string[] = [];
|
|
470
|
+
const escalationTargets: string[] = [];
|
|
471
|
+
const escalationMs = stewardEscalationMs();
|
|
472
|
+
const fallbackTarget = stewardFallbackTarget();
|
|
473
|
+
for (const ws of all) {
|
|
474
|
+
if (!STRANDABLE_STATUSES.has(ws.status) || ws.mode !== "isolated" || !ws.worktreePath) continue;
|
|
475
|
+
reelectRepoSteward(ws.repoRoot);
|
|
476
|
+
const fresh = getWorkspace(ws.id);
|
|
477
|
+
if (!fresh || !STRANDABLE_STATUSES.has(fresh.status)) continue;
|
|
478
|
+
const meta = fresh.metadata as Record<string, unknown>;
|
|
479
|
+
const steward = fresh.stewardAgentId;
|
|
480
|
+
const stewardOnline = Boolean(steward && getAgent(steward) && getAgent(steward)!.status !== "offline");
|
|
481
|
+
const strandedAt = typeof meta.strandedAt === "number" ? meta.strandedAt : undefined;
|
|
482
|
+
|
|
483
|
+
if (stewardOnline) {
|
|
484
|
+
// An online steward owns it. If it was previously stranded and this
|
|
485
|
+
// steward hasn't been told, hand it off explicitly, then clear markers.
|
|
486
|
+
if (strandedAt !== undefined && meta.strandedNotifiedSteward !== steward) {
|
|
487
|
+
const sent = notifyTarget(
|
|
488
|
+
steward!,
|
|
489
|
+
"Workspace stewardship reassigned",
|
|
490
|
+
`You are now steward for ${fresh.repoRoot}. Workspace \`${fresh.branch ?? fresh.id}\` is ${fresh.status} and was stranded without an online steward — please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"}.`,
|
|
491
|
+
{ kind: "workspace.steward-reassigned", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status },
|
|
492
|
+
);
|
|
493
|
+
if (sent) reassignedIds.push(fresh.id);
|
|
494
|
+
}
|
|
495
|
+
patchWorkspaceMetadata(fresh.id, { strandedAt: undefined, escalatedAt: undefined, strandedNotifiedSteward: steward });
|
|
496
|
+
continue;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
// Stranded: no online steward could be elected.
|
|
500
|
+
if (strandedAt === undefined) { patchWorkspaceMetadata(fresh.id, { strandedAt: now }); continue; }
|
|
501
|
+
if (now - strandedAt < escalationMs || meta.escalatedAt) continue;
|
|
502
|
+
const sent = notifyTarget(
|
|
503
|
+
fallbackTarget,
|
|
504
|
+
"Stranded workspace needs an owner",
|
|
505
|
+
`Workspace \`${fresh.branch ?? fresh.id}\` in ${fresh.repoRoot} is ${fresh.status} with no online steward (all repo agents offline) for ${Math.round((now - strandedAt) / (60 * 60 * 1000))}h. Please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"} or clean up the worktree.`,
|
|
506
|
+
{ kind: "workspace.stranded-escalation", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status, strandedAt },
|
|
507
|
+
);
|
|
508
|
+
if (sent) escalationTargets.push(sent);
|
|
509
|
+
patchWorkspaceMetadata(fresh.id, { escalatedAt: now });
|
|
510
|
+
escalatedIds.push(fresh.id);
|
|
511
|
+
createActivityEvent({
|
|
512
|
+
clientId: `workspace-gc-escalate-${fresh.id}-${now}`,
|
|
513
|
+
kind: "state",
|
|
514
|
+
title: "Workspace escalated",
|
|
515
|
+
body: `${fresh.branch ?? fresh.id} in ${fresh.repoRoot} — stranded ${fresh.status} escalated${fallbackTarget ? ` to ${fallbackTarget}` : " (no fallback configured)"}`,
|
|
516
|
+
meta: fresh.branch ?? fresh.id,
|
|
517
|
+
icon: "ti-alert-octagon",
|
|
518
|
+
view: "orchestrators",
|
|
519
|
+
metadata: { source: "server", maintenanceJobId: "workspace-gc", workspaceId: fresh.id, fallback: fallbackTarget || null },
|
|
520
|
+
});
|
|
521
|
+
}
|
|
522
|
+
|
|
523
|
+
// 3. Auto-abandon stale review_requested worktrees
|
|
432
524
|
const abandonedIds: string[] = [];
|
|
433
525
|
const notifiedStewards: string[] = [];
|
|
434
526
|
for (const ws of all) {
|
|
435
527
|
if (ws.status === "review_requested" && ws.updatedAt < reviewCutoff) {
|
|
436
528
|
updateWorkspaceStatus(ws.id, "abandoned", { autoAbandoned: true, abandonedReason: "review_requested TTL exceeded", abandonedAt: now });
|
|
437
529
|
abandonedIds.push(ws.id);
|
|
438
|
-
if
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
emitNewMessage(msg);
|
|
449
|
-
notifiedStewards.push(ws.stewardAgentId);
|
|
450
|
-
} catch {
|
|
451
|
-
// Steward gone — activity event is enough.
|
|
452
|
-
}
|
|
453
|
-
}
|
|
530
|
+
// Notify the steward if one exists, else the configured fallback so a
|
|
531
|
+
// stranded abandon isn't silent (issue #157).
|
|
532
|
+
const target = ws.stewardAgentId ?? fallbackTarget;
|
|
533
|
+
const sent = notifyTarget(
|
|
534
|
+
target,
|
|
535
|
+
"Workspace auto-abandoned",
|
|
536
|
+
`Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} was auto-abandoned after ${Math.round(WORKSPACE_REVIEW_TTL_MS / DAY_MS)}d without steward action. Run workspace cleanup to reclaim the worktree.`,
|
|
537
|
+
{ kind: "workspace.auto-abandoned", workspaceId: ws.id, repoRoot: ws.repoRoot, branch: ws.branch },
|
|
538
|
+
);
|
|
539
|
+
if (sent) notifiedStewards.push(sent);
|
|
454
540
|
createActivityEvent({
|
|
455
541
|
clientId: `workspace-gc-abandon-${ws.id}-${now}`,
|
|
456
542
|
kind: "state",
|
|
@@ -483,7 +569,16 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
|
483
569
|
pruneCommands.push(command.id);
|
|
484
570
|
}
|
|
485
571
|
|
|
486
|
-
return {
|
|
572
|
+
return {
|
|
573
|
+
prunedTerminal: terminalIds,
|
|
574
|
+
autoAbandoned: abandonedIds,
|
|
575
|
+
notifiedStewards,
|
|
576
|
+
pruneCommands,
|
|
577
|
+
releasedLeaseRepos,
|
|
578
|
+
escalated: escalatedIds,
|
|
579
|
+
reassigned: reassignedIds,
|
|
580
|
+
escalationTargets,
|
|
581
|
+
};
|
|
487
582
|
}
|
|
488
583
|
|
|
489
584
|
let timer: Timer | null = null;
|
package/src/routes.ts
CHANGED
|
@@ -85,6 +85,11 @@ import {
|
|
|
85
85
|
getWorkspace,
|
|
86
86
|
listWorkspaces,
|
|
87
87
|
updateWorkspaceStatus,
|
|
88
|
+
acquireMergeLease,
|
|
89
|
+
setMergeLeaseCommand,
|
|
90
|
+
releaseMergeLease,
|
|
91
|
+
listRepoStewards,
|
|
92
|
+
listMergeLeases,
|
|
88
93
|
deleteWorkspace,
|
|
89
94
|
deleteOrchestrator,
|
|
90
95
|
evaluatePoolBindings,
|
|
@@ -3799,6 +3804,10 @@ const getWorkspaceById: Handler = (_req, params) => {
|
|
|
3799
3804
|
return json(workspace);
|
|
3800
3805
|
};
|
|
3801
3806
|
|
|
3807
|
+
// Per-repo coordination state: persistent steward records (survive offline gaps)
|
|
3808
|
+
// and in-flight merge serialization leases (issue #157).
|
|
3809
|
+
const getWorkspaceStewards: Handler = () => json({ stewards: listRepoStewards(), mergeLeases: listMergeLeases() });
|
|
3810
|
+
|
|
3802
3811
|
// Proxy a read-only workspace interrogation to the owning orchestrator's host
|
|
3803
3812
|
// API. Degrades to { available: false } rather than erroring so the dashboard
|
|
3804
3813
|
// can render a placeholder when the host is offline or there's no worktree.
|
|
@@ -3931,6 +3940,7 @@ const postWorkspaceOrphanReclaim: Handler = async (req) => {
|
|
|
3931
3940
|
const postWorkspaceAction: Handler = async (req, params) => {
|
|
3932
3941
|
const parsed = await parseBody<unknown>(req);
|
|
3933
3942
|
if (!parsed.ok) return error(parsed.error, parsed.status);
|
|
3943
|
+
let mergeLeaseRepo: string | undefined;
|
|
3934
3944
|
try {
|
|
3935
3945
|
if (!isRecord(parsed.body)) return error("body required");
|
|
3936
3946
|
const workspace = getWorkspace(params.id!);
|
|
@@ -3949,6 +3959,18 @@ const postWorkspaceAction: Handler = async (req, params) => {
|
|
|
3949
3959
|
const denied = authorizeRoute(req, { scope: requiresCommand ? "command:write" : "agent:write", resource: { agentId, cwd: workspace.worktreePath } });
|
|
3950
3960
|
if (denied) return denied;
|
|
3951
3961
|
if (action === "status") return json(workspace);
|
|
3962
|
+
// Serialize base merges per repo: acquire the merge lease BEFORE mutating
|
|
3963
|
+
// status so a losing request leaves the workspace untouched (issue #157).
|
|
3964
|
+
if (action === "merge") {
|
|
3965
|
+
const lease = acquireMergeLease(workspace.repoRoot, workspace.id, agentId ?? "dashboard");
|
|
3966
|
+
if (!lease.ok) {
|
|
3967
|
+
return error(
|
|
3968
|
+
`a merge is already in progress for ${workspace.repoRoot} (workspace ${lease.lease.workspaceId}); retry after it settles`,
|
|
3969
|
+
409,
|
|
3970
|
+
);
|
|
3971
|
+
}
|
|
3972
|
+
mergeLeaseRepo = workspace.repoRoot;
|
|
3973
|
+
}
|
|
3952
3974
|
const statusByAction: Record<string, WorkspaceStatus | undefined> = {
|
|
3953
3975
|
status: undefined,
|
|
3954
3976
|
ready: "ready",
|
|
@@ -3982,7 +4004,11 @@ const postWorkspaceAction: Handler = async (req, params) => {
|
|
|
3982
4004
|
};
|
|
3983
4005
|
if (action === "merge") {
|
|
3984
4006
|
// Merge needs a live host: rebasing against a stale base later is unsafe.
|
|
3985
|
-
if (!onlineOwner)
|
|
4007
|
+
if (!onlineOwner) {
|
|
4008
|
+
releaseMergeLease({ repoRoot: workspace.repoRoot, workspaceId: workspace.id });
|
|
4009
|
+
mergeLeaseRepo = undefined;
|
|
4010
|
+
return error("no online orchestrator available for workspace merge", 409);
|
|
4011
|
+
}
|
|
3986
4012
|
const strategy = cleanEnum(parsed.body.strategy, "strategy", ["pr", "rebase-ff", "auto"] as const, "auto");
|
|
3987
4013
|
command = createCommand({
|
|
3988
4014
|
type: "workspace.merge",
|
|
@@ -4014,6 +4040,9 @@ const postWorkspaceAction: Handler = async (req, params) => {
|
|
|
4014
4040
|
params: { action: "cleanup", ...baseParams, deleteBranch: true, queued: owner.status !== "online" },
|
|
4015
4041
|
});
|
|
4016
4042
|
}
|
|
4043
|
+
// Bind the lease to the dispatched merge command so it's released by id
|
|
4044
|
+
// when the command settles (postCommandResult).
|
|
4045
|
+
if (action === "merge" && mergeLeaseRepo) setMergeLeaseCommand(mergeLeaseRepo, command.id);
|
|
4017
4046
|
emitCommand(command);
|
|
4018
4047
|
}
|
|
4019
4048
|
auditEvent({
|
|
@@ -4029,6 +4058,9 @@ const postWorkspaceAction: Handler = async (req, params) => {
|
|
|
4029
4058
|
});
|
|
4030
4059
|
return json({ workspace: updated, command }, requiresCommand ? 202 : 200);
|
|
4031
4060
|
} catch (e) {
|
|
4061
|
+
// A merge that acquired the lease but failed before dispatch must release it,
|
|
4062
|
+
// or the repo stays blocked until the TTL expires.
|
|
4063
|
+
if (mergeLeaseRepo) releaseMergeLease({ repoRoot: mergeLeaseRepo });
|
|
4032
4064
|
if (e instanceof ValidationError) return error(e.message, 400);
|
|
4033
4065
|
throw e;
|
|
4034
4066
|
}
|
|
@@ -4336,6 +4368,11 @@ const patchCommand: Handler = async (req, params) => {
|
|
|
4336
4368
|
}
|
|
4337
4369
|
}
|
|
4338
4370
|
if (command.type === "workspace.merge") {
|
|
4371
|
+
// Merge settled (either way) — free the per-repo merge lease so the next
|
|
4372
|
+
// base merge can proceed (issue #157).
|
|
4373
|
+
if (command.status === "succeeded" || command.status === "failed") {
|
|
4374
|
+
releaseMergeLease({ commandId: command.id });
|
|
4375
|
+
}
|
|
4339
4376
|
if (command.status === "succeeded" && isRecord(command.result)) {
|
|
4340
4377
|
const workspaceId = cleanString(command.result.workspaceId, "result.workspaceId", { max: 160 });
|
|
4341
4378
|
const resultStatus = cleanEnum(command.result.status, "result.status", VALID_WORKSPACE_STATUSES) as WorkspaceStatus | undefined;
|
|
@@ -6259,6 +6296,7 @@ const routes: Route[] = [
|
|
|
6259
6296
|
// Static segments before :id so "/workspaces/orphans" isn't captured as an id.
|
|
6260
6297
|
route("GET", "/api/workspaces/orphans", getWorkspaceOrphans),
|
|
6261
6298
|
route("POST", "/api/workspaces/orphans/reclaim", postWorkspaceOrphanReclaim),
|
|
6299
|
+
route("GET", "/api/workspaces/stewards", getWorkspaceStewards),
|
|
6262
6300
|
route("GET", "/api/workspaces/:id", getWorkspaceById),
|
|
6263
6301
|
route("GET", "/api/workspaces/:id/git-state", getWorkspaceGitState),
|
|
6264
6302
|
route("GET", "/api/workspaces/:id/merge-preview", getWorkspaceMergePreview),
|