@desplega.ai/agent-swarm 1.53.0 → 1.54.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -58,6 +58,8 @@ Agent Swarm lets you run a team of AI coding agents that coordinate autonomously
58
58
  - **Onboarding wizard** — Interactive CLI wizard (`agent-swarm onboard`) to set up a new swarm from scratch with presets, credential collection, and docker-compose generation
59
59
  - **Skill system** — Reusable procedural knowledge: create, install, publish, and sync skills from GitHub with scope resolution (agent → swarm → global)
60
60
  - **Human-in-the-Loop** — Workflow nodes that pause for human approval or input, with a dashboard UI for reviewing and responding to requests
61
+ - **MCP server management** — Register, install, and manage MCP servers for agents with scope cascade (agent → swarm → global) and auto-injection into worker containers
62
+ - **Context usage tracking** — Monitor context window utilization and compaction events per task with visual indicators in the dashboard
61
63
 
62
64
  ## Quick Start
63
65
 
package/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "Agent Swarm API",
5
- "version": "1.52.1",
5
+ "version": "1.53.1",
6
6
  "description": "Multi-agent orchestration API for Claude Code, Codex, and Gemini CLI. Enables task distribution, agent communication, and service discovery.\n\nMCP tools are documented separately in [MCP.md](./MCP.md)."
7
7
  },
8
8
  "servers": [
@@ -2424,6 +2424,27 @@
2424
2424
  }
2425
2425
  }
2426
2426
  },
2427
+ "/api/heartbeat/sweep": {
2428
+ "post": {
2429
+ "summary": "Trigger an immediate heartbeat sweep",
2430
+ "tags": [
2431
+ "Heartbeat"
2432
+ ],
2433
+ "security": [
2434
+ {
2435
+ "bearerAuth": []
2436
+ }
2437
+ ],
2438
+ "responses": {
2439
+ "200": {
2440
+ "description": "Sweep completed successfully"
2441
+ },
2442
+ "401": {
2443
+ "description": "Unauthorized"
2444
+ }
2445
+ }
2446
+ }
2447
+ },
2427
2448
  "/api/memory/index": {
2428
2449
  "post": {
2429
2450
  "summary": "Ingest content into memory system (async embedding)",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@desplega.ai/agent-swarm",
3
- "version": "1.53.0",
3
+ "version": "1.54.1",
4
4
  "description": "Multi-agent orchestration for Claude Code, Codex, Gemini CLI, and other AI coding assistants",
5
5
  "license": "MIT",
6
6
  "author": "desplega.sh <contact@desplega.sh>",
@@ -21,11 +21,17 @@ Once you have the task details, you should:
21
21
  - Use `memory-get` on any highly relevant results to get full details
22
22
  - This step is NOT optional. Past learnings compound your effectiveness.
23
23
  <!-- /claude-only -->
24
- 2. Figure out if you need to use any of the available commands to help you with your work (see below for available commands)
25
- 2. Use the `/todos` command to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
26
- 3. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/swarm-chat` command to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
27
- 4. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
28
- 5. Once you either done or in a dead-end, see the "Completion" section below.
24
+ 2. **Check Installed Skills (REQUIRED):** Before researching or implementing, review your "Installed Skills" section in the system prompt:
25
+ - If any skill's description or trigger matches this task, invoke it via the `Skill` tool BEFORE doing manual research
26
+ - Skills contain pre-built, tested procedures that save context window and cost
27
+ - Example: if the task involves Linear, use the `linear-interaction` skill; if it involves email, use the `agentmail-sending` skill
28
+ - Only proceed to manual research/web search if NO installed skill covers the task
29
+ - This step is NOT optional. Skipping it wastes context and money.
30
+ 3. Figure out if you need to use any of the available commands to help you with your work (see below for available commands)
31
+ 4. Use the `/todos` command to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
32
+ 5. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/swarm-chat` command to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
33
+ 6. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
34
+ 7. Once you are either done or at a dead end, see the "Completion" section below.
29
35
 
30
36
  ### Available commands
31
37
 
@@ -13,11 +13,17 @@ Once you get a task assigned, you need to immediately start working on it. To do
13
13
 
14
14
  Once you have the task details, you should:
15
15
 
16
- 1. Figure out if you need to perform any research or planning before starting (see below)
17
- 2. Use the `/skill:todos` to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
18
- 3. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/skill:swarm-chat` to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
19
- 4. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
20
- 5. Once you either done or in a dead-end, see the "Completion" section below.
16
+ 1. **Check Installed Skills (REQUIRED):** Before researching or implementing, review your "Installed Skills" section in the system prompt:
17
+ - If any skill's description or trigger matches this task, invoke it via the `Skill` tool BEFORE doing manual research
18
+ - Skills contain pre-built, tested procedures that save context window and cost
19
+ - Example: if the task involves Linear, use the `linear-interaction` skill; if it involves email, use the `agentmail-sending` skill
20
+ - Only proceed to manual research/web search if NO installed skill covers the task
21
+ - This step is NOT optional. Skipping it wastes context and money.
22
+ 2. Figure out if you need to perform any research or planning before starting (see below)
23
+ 3. Use the `/skill:todos` to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
24
+ 4. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/skill:swarm-chat` to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
25
+ 5. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
26
+ 6. Once you are either done or at a dead end, see the "Completion" section below.
21
27
 
22
28
  ### Research and Planning
23
29
 
package/src/be/db.ts CHANGED
@@ -728,6 +728,7 @@ type AgentTaskRow = {
728
728
  peakContextPercent: number | null;
729
729
  totalContextTokensUsed: number | null;
730
730
  contextWindowSize: number | null;
731
+ was_paused: number;
731
732
  };
732
733
 
733
734
  function rowToAgentTask(row: AgentTaskRow): AgentTask {
@@ -781,6 +782,7 @@ function rowToAgentTask(row: AgentTaskRow): AgentTask {
781
782
  failureReason: row.failureReason ?? undefined,
782
783
  output: row.output ?? undefined,
783
784
  progress: row.progress ?? undefined,
785
+ wasPaused: !!row.was_paused,
784
786
  };
785
787
  }
786
788
 
@@ -1509,6 +1511,7 @@ export function pauseTask(id: string): AgentTask | null {
1509
1511
  .prepare<AgentTaskRow, [string]>(
1510
1512
  `UPDATE agent_tasks
1511
1513
  SET status = 'paused',
1514
+ was_paused = 1,
1512
1515
  lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
1513
1516
  WHERE id = ? AND status = 'in_progress'
1514
1517
  RETURNING *`,
@@ -1543,6 +1546,7 @@ export function resumeTask(taskId: string): AgentTask | null {
1543
1546
  .prepare<AgentTaskRow, [string]>(
1544
1547
  `UPDATE agent_tasks
1545
1548
  SET status = 'in_progress',
1549
+ was_paused = 1,
1546
1550
  lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
1547
1551
  WHERE id = ? AND status = 'paused'
1548
1552
  RETURNING *`,
@@ -5599,6 +5603,18 @@ export function updateActiveSessionProviderSessionId(
5599
5603
  return result.changes > 0;
5600
5604
  }
5601
5605
 
5606
+ /**
5607
+ * Get the active session for a specific task.
5608
+ * Used by the heartbeat to cross-reference stalled tasks with worker sessions.
5609
+ */
5610
+ export function getActiveSessionForTask(taskId: string): ActiveSession | null {
5611
+ return (
5612
+ getDb()
5613
+ .prepare<ActiveSession, [string]>("SELECT * FROM active_sessions WHERE taskId = ? LIMIT 1")
5614
+ .get(taskId) ?? null
5615
+ );
5616
+ }
5617
+
5602
5618
  /**
5603
5619
  * Reassociate session logs from a runner session to a real task ID.
5604
5620
  * Used when a pool task is claimed — logs were stored under a random UUID,
@@ -6222,6 +6238,24 @@ export function getStepByIdempotencyKey(key: string): WorkflowRunStep | null {
6222
6238
  return row ? rowToWorkflowRunStep(row) : null;
6223
6239
  }
6224
6240
 
6241
+ export function getStepCountForNode(runId: string, nodeId: string): number {
6242
+ const row = getDb()
6243
+ .prepare<{ cnt: number }, [string, string]>(
6244
+ "SELECT COUNT(*) as cnt FROM workflow_run_steps WHERE runId = ? AND nodeId = ?",
6245
+ )
6246
+ .get(runId, nodeId);
6247
+ return row?.cnt ?? 0;
6248
+ }
6249
+
6250
+ export function getLatestStepForNode(runId: string, nodeId: string): WorkflowRunStep | null {
6251
+ const row = getDb()
6252
+ .prepare<WorkflowRunStepRow, [string, string]>(
6253
+ "SELECT * FROM workflow_run_steps WHERE runId = ? AND nodeId = ? ORDER BY startedAt DESC LIMIT 1",
6254
+ )
6255
+ .get(runId, nodeId);
6256
+ return row ? rowToWorkflowRunStep(row) : null;
6257
+ }
6258
+
6225
6259
  // --- Workflow Version History ---
6226
6260
 
6227
6261
  type WorkflowVersionRow = {
@@ -7855,6 +7889,13 @@ export function createContextSnapshot(input: CreateContextSnapshotInput): Contex
7855
7889
  .run(input.contextPercent, input.taskId);
7856
7890
  }
7857
7891
 
7892
+ // Keep totalContextTokensUsed up to date with the latest known value
7893
+ if (input.contextUsedTokens != null) {
7894
+ getDb()
7895
+ .prepare("UPDATE agent_tasks SET totalContextTokensUsed = ? WHERE id = ?")
7896
+ .run(input.contextUsedTokens, input.taskId);
7897
+ }
7898
+
7858
7899
  if (input.eventType === "compaction") {
7859
7900
  getDb()
7860
7901
  .prepare(
@@ -7863,13 +7904,10 @@ export function createContextSnapshot(input: CreateContextSnapshotInput): Contex
7863
7904
  .run(input.taskId);
7864
7905
  }
7865
7906
 
7866
- if (input.eventType === "completion") {
7907
+ if (input.eventType === "completion" && input.contextTotalTokens != null) {
7867
7908
  getDb()
7868
- .prepare(
7869
- `UPDATE agent_tasks SET totalContextTokensUsed = ?, contextWindowSize = ?
7870
- WHERE id = ?`,
7871
- )
7872
- .run(input.contextUsedTokens ?? null, input.contextTotalTokens ?? null, input.taskId);
7909
+ .prepare("UPDATE agent_tasks SET contextWindowSize = ? WHERE id = ?")
7910
+ .run(input.contextTotalTokens, input.taskId);
7873
7911
  }
7874
7912
 
7875
7913
  return {
@@ -0,0 +1 @@
1
+ ALTER TABLE agent_tasks ADD COLUMN was_paused INTEGER NOT NULL DEFAULT 0;
@@ -305,6 +305,12 @@ export function humanizeToolName(name: string): string {
305
305
  export function toolCallToProgress(toolName: string, args: unknown): string | null {
306
306
  if (SKIP_PROGRESS_TOOLS.has(toolName)) return null;
307
307
 
308
+ // Normalize: pi-mono uses lowercase ("read"), Claude uses PascalCase ("Read")
309
+ const normalized =
310
+ toolName.startsWith("mcp__") || toolName.includes("_")
311
+ ? toolName
312
+ : toolName.charAt(0).toUpperCase() + toolName.slice(1);
313
+
308
314
  const a = args as Record<string, unknown>;
309
315
  const shortPath = (p: unknown) => {
310
316
  if (typeof p !== "string") return "";
@@ -313,7 +319,7 @@ export function toolCallToProgress(toolName: string, args: unknown): string | nu
313
319
  return parts.length > 2 ? parts.slice(-2).join("/") : p;
314
320
  };
315
321
 
316
- switch (toolName) {
322
+ switch (normalized) {
317
323
  case "Read":
318
324
  return `📖 Reading ${shortPath(a.file_path)}`;
319
325
  case "Edit":
@@ -1067,6 +1073,25 @@ async function cleanupActiveSessions(config: ApiConfig): Promise<void> {
1067
1073
  }
1068
1074
  }
1069
1075
 
1076
+ /** Trigger a heartbeat sweep via the API (lead startup self-check) */
1077
+ async function triggerHeartbeatSweep(config: ApiConfig): Promise<boolean> {
1078
+ try {
1079
+ const headers: Record<string, string> = {
1080
+ "Content-Type": "application/json",
1081
+ "X-Agent-ID": config.agentId,
1082
+ };
1083
+ if (config.apiKey) headers.Authorization = `Bearer ${config.apiKey}`;
1084
+ const resp = await fetch(`${config.apiUrl}/api/heartbeat/sweep`, {
1085
+ method: "POST",
1086
+ headers,
1087
+ });
1088
+ return resp.ok;
1089
+ } catch (err) {
1090
+ console.warn(`[runner] Failed to trigger heartbeat sweep: ${(err as Error).message}`);
1091
+ return false;
1092
+ }
1093
+ }
1094
+
1070
1095
  /** Trigger types returned by the poll API */
1071
1096
  interface Trigger {
1072
1097
  type:
@@ -1991,6 +2016,9 @@ async function checkCompletedProcesses(
1991
2016
  failureReason,
1992
2017
  },
1993
2018
  validator: (data) => data.exitCode === 0,
2019
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
2020
+ filter: ({}, ctx) => ctx.deps.length > 0,
2021
+ conditions: [{ timeout_ms: 3_600_000 }], // 1 hour: process runtime
1994
2022
  });
1995
2023
 
1996
2024
  // Commit channel activity cursors after successful processing
@@ -2703,6 +2731,17 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2703
2731
  }
2704
2732
  // ========== END: Resume paused tasks ==========
2705
2733
 
2734
+ // ========== Lead startup self-check ==========
2735
+ if (isLead) {
2736
+ console.log(`[${role}] Running startup heartbeat sweep...`);
2737
+ const swept = await triggerHeartbeatSweep(apiConfig);
2738
+ if (swept) {
2739
+ console.log(`[${role}] Startup heartbeat sweep completed`);
2740
+ } else {
2741
+ console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
2742
+ }
2743
+ }
2744
+
2706
2745
  // Track last finished task check for leads (to avoid re-processing)
2707
2746
  while (true) {
2708
2747
  // Ping server on each iteration to keep status updated
@@ -2790,6 +2829,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2790
2829
  triggerType: trigger.type,
2791
2830
  role,
2792
2831
  },
2832
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
2833
+ filter: ({}, ctx) => ctx.deps.length > 0,
2834
+ conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
2793
2835
  });
2794
2836
  }
2795
2837
 
@@ -3020,6 +3062,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
3020
3062
  role,
3021
3063
  model: taskModel,
3022
3064
  },
3065
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
3066
+ filter: ({}, ctx) => ctx.deps.length > 0,
3067
+ conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
3023
3068
  });
3024
3069
 
3025
3070
  // Attach trigger metadata for logging
@@ -2,6 +2,9 @@ import {
2
2
  claimTask,
3
3
  cleanupStaleSessions,
4
4
  createTaskExtended,
5
+ deleteActiveSession,
6
+ failTask,
7
+ getActiveSessionForTask,
5
8
  getActiveTaskCount,
6
9
  getAllAgents,
7
10
  getDb,
@@ -29,15 +32,25 @@ import "./templates";
29
32
  /** Default heartbeat interval: 90 seconds */
30
33
  const DEFAULT_INTERVAL_MS = Number(process.env.HEARTBEAT_INTERVAL_MS) || 90_000;
31
34
 
32
- /** Stall threshold: tasks in_progress with no update for this many minutes */
35
+ /** Stall threshold: tasks with fresh worker heartbeat but no task update for this many minutes */
33
36
  const STALL_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALL_THRESHOLD_MIN) || 30;
34
37
 
38
+ /** Stall threshold: tasks with no active session (worker clearly dead) */
39
+ const STALL_THRESHOLD_NO_SESSION_MIN = Number(process.env.HEARTBEAT_STALL_NO_SESSION_MIN) || 5;
40
+
41
+ /** Stall threshold: tasks with stale worker heartbeat */
42
+ const STALL_THRESHOLD_STALE_HEARTBEAT_MIN = Number(process.env.HEARTBEAT_STALL_STALE_HB_MIN) || 15;
43
+
35
44
  /** Stale resource cleanup threshold (minutes) */
36
45
  const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEANUP_MIN) || 30;
37
46
 
38
47
  /** Max pool tasks to auto-assign per sweep */
39
48
  const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
40
49
 
50
+ /** Escalation cooldown: minimum time between escalations for the same task set (ms) */
51
+ const ESCALATION_COOLDOWN_MS =
52
+ Number(process.env.HEARTBEAT_ESCALATION_COOLDOWN_MS) || 15 * 60 * 1000;
53
+
41
54
  const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
42
55
 
43
56
  // ============================================================================
@@ -46,6 +59,7 @@ const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
46
59
 
47
60
  export interface HeartbeatFindings {
48
61
  stalledTasks: AgentTask[];
62
+ autoFailedTasks: Array<{ taskId: string; agentId: string; reason: string }>;
49
63
  workerHealthFixes: Array<{ agentId: string; oldStatus: string; newStatus: string }>;
50
64
  autoAssigned: Array<{ taskId: string; agentId: string }>;
51
65
  staleCleanup: {
@@ -66,6 +80,9 @@ export interface HeartbeatFindings {
66
80
  let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
67
81
  let isSweeping = false;
68
82
 
83
+ /** Tracks last escalation time per escalation key to prevent spam */
84
+ const lastEscalationTime: Map<string, number> = new Map();
85
+
69
86
  // ============================================================================
70
87
  // Tier 1: Preflight Gate
71
88
  // ============================================================================
@@ -106,6 +123,7 @@ export function preflightGate(): boolean {
106
123
  export async function codeLevelTriage(): Promise<HeartbeatFindings> {
107
124
  const findings: HeartbeatFindings = {
108
125
  stalledTasks: [],
126
+ autoFailedTasks: [],
109
127
  workerHealthFixes: [],
110
128
  autoAssigned: [],
111
129
  staleCleanup: {
@@ -118,8 +136,8 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
118
136
  escalationNeeded: false,
119
137
  };
120
138
 
121
- // 1. Detect stalled tasks
122
- detectStalledTasks(findings);
139
+ // 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers, escalate ambiguous)
140
+ detectAndRemediateStalledTasks(findings);
123
141
 
124
142
  // 2. Check and fix worker health
125
143
  checkWorkerHealth(findings);
@@ -137,11 +155,72 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
137
155
  }
138
156
 
139
157
  /**
140
- * Detect in_progress tasks that haven't been updated in a while.
158
+ * Tiered stall detection and auto-remediation.
159
+ *
160
+ * Cross-checks stalled tasks with active_sessions to determine severity:
161
+ * - No active session → worker is dead → auto-fail (5 min threshold)
162
+ * - Stale session heartbeat → worker likely crashed → auto-fail (15 min threshold)
163
+ * - Fresh session heartbeat → worker alive but task stale → escalate to lead (30 min threshold)
141
164
  */
142
- function detectStalledTasks(findings: HeartbeatFindings): void {
143
- const stalled = getStalledInProgressTasks(STALL_THRESHOLD_MINUTES);
144
- findings.stalledTasks = stalled;
165
+ function detectAndRemediateStalledTasks(findings: HeartbeatFindings): void {
166
+ // Use the shortest threshold to catch all potentially stalled tasks
167
+ const candidates = getStalledInProgressTasks(STALL_THRESHOLD_NO_SESSION_MIN);
168
+
169
+ for (const task of candidates) {
170
+ if (!task.agentId) continue; // Unassigned tasks can't be stalled
171
+
172
+ const session = getActiveSessionForTask(task.id);
173
+ const taskAgeMs = Date.now() - new Date(task.lastUpdatedAt).getTime();
174
+
175
+ if (!session) {
176
+ // Case A: No active session — worker is dead
177
+ if (taskAgeMs >= STALL_THRESHOLD_NO_SESSION_MIN * 60 * 1000) {
178
+ const reason =
179
+ "Auto-failed by heartbeat: worker session not found (no active session for task)";
180
+ const failed = failTask(task.id, reason);
181
+ if (failed) {
182
+ findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
183
+ console.log(`[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — no active session`);
184
+
185
+ // Fix agent status if no other active tasks
186
+ const remaining = getActiveTaskCount(task.agentId);
187
+ if (remaining === 0) {
188
+ updateAgentStatus(task.agentId, "idle");
189
+ }
190
+ }
191
+ }
192
+ } else {
193
+ const sessionHeartbeatAgeMs = Date.now() - new Date(session.lastHeartbeatAt).getTime();
194
+ const isStaleHeartbeat =
195
+ sessionHeartbeatAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000;
196
+
197
+ if (isStaleHeartbeat) {
198
+ // Case B: Session exists but heartbeat is stale — worker likely crashed
199
+ if (taskAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000) {
200
+ const reason =
201
+ "Auto-failed by heartbeat: worker session heartbeat is stale (likely crashed)";
202
+ const failed = failTask(task.id, reason);
203
+ if (failed) {
204
+ findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
205
+ deleteActiveSession(task.id);
206
+ console.log(
207
+ `[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — stale session heartbeat`,
208
+ );
209
+
210
+ const remaining = getActiveTaskCount(task.agentId);
211
+ if (remaining === 0) {
212
+ updateAgentStatus(task.agentId, "idle");
213
+ }
214
+ }
215
+ }
216
+ } else {
217
+ // Case C: Session exists and heartbeat is fresh — ambiguous
218
+ if (taskAgeMs >= STALL_THRESHOLD_MINUTES * 60 * 1000) {
219
+ findings.stalledTasks.push(task);
220
+ }
221
+ }
222
+ }
223
+ }
145
224
  }
146
225
 
147
226
  /**
@@ -232,15 +311,13 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
232
311
 
233
312
  /**
234
313
  * Evaluate whether findings require escalation to a Claude session (lead agent).
235
- * Only escalate for truly ambiguous situations that need human-level reasoning.
314
+ * Only escalate for ambiguous stalls (worker alive but task not updating).
236
315
  */
237
316
  function evaluateEscalation(findings: HeartbeatFindings): void {
238
- // Stalled tasks are ambiguous — the task might be actively worked on
239
- // but the worker just hasn't called store-progress recently
240
317
  if (findings.stalledTasks.length > 0) {
241
318
  findings.escalationNeeded = true;
242
319
  const taskIds = findings.stalledTasks.map((t) => t.id.slice(0, 8)).join(", ");
243
- findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled (no update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
320
+ findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled with active worker (no task update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
244
321
  }
245
322
  }
246
323
 
@@ -255,6 +332,13 @@ function escalateToLead(findings: HeartbeatFindings): void {
255
332
  }
256
333
 
257
334
  const escalationKey = buildEscalationKey(findings);
335
+
336
+ // Cooldown check — prevent repeated escalations for the same task set
337
+ const lastTime = lastEscalationTime.get(escalationKey);
338
+ if (lastTime && Date.now() - lastTime < ESCALATION_COOLDOWN_MS) {
339
+ return;
340
+ }
341
+
258
342
  if (hasActiveEscalationTask(lead.id, escalationKey)) {
259
343
  return;
260
344
  }
@@ -294,6 +378,7 @@ function escalateToLead(findings: HeartbeatFindings): void {
294
378
  priority: 70,
295
379
  });
296
380
 
381
+ lastEscalationTime.set(escalationKey, Date.now());
297
382
  console.log(`[Heartbeat] Created triage task for lead ${lead.name}`);
298
383
  }
299
384
 
@@ -337,6 +422,7 @@ export async function runHeartbeatSweep(): Promise<void> {
337
422
  if (!preflightGate()) {
338
423
  const cleanupOnlyFindings: HeartbeatFindings = {
339
424
  stalledTasks: [],
425
+ autoFailedTasks: [],
340
426
  workerHealthFixes: [],
341
427
  autoAssigned: [],
342
428
  staleCleanup: {
@@ -374,6 +460,9 @@ export async function runHeartbeatSweep(): Promise<void> {
374
460
  function logFindings(findings: HeartbeatFindings): void {
375
461
  const parts: string[] = [];
376
462
 
463
+ if (findings.autoFailedTasks.length > 0) {
464
+ parts.push(`auto_failed=${findings.autoFailedTasks.length}`);
465
+ }
377
466
  if (findings.stalledTasks.length > 0) {
378
467
  parts.push(`stalled=${findings.stalledTasks.length}`);
379
468
  }
@@ -432,3 +521,10 @@ export function stopHeartbeat(): void {
432
521
  console.log("[Heartbeat] Stopped");
433
522
  }
434
523
  }
524
+
525
+ /**
526
+ * Reset escalation cooldown state. Exported for testing only.
527
+ */
528
+ export function resetEscalationCooldowns(): void {
529
+ lastEscalationTime.clear();
530
+ }
@@ -205,6 +205,9 @@ export async function handleAgentRegister(
205
205
  // Validates that registered happened before reconnected
206
206
  return ctx.deps.length > 0;
207
207
  },
208
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
209
+ filter: ({}, ctx) => ctx.deps.length > 0,
210
+ conditions: [{ timeout_ms: 86_400_000 }], // 1 day: agents may be offline for extended periods
208
211
  });
209
212
  }
210
213
 
@@ -0,0 +1,43 @@
1
+ import type { IncomingMessage, ServerResponse } from "node:http";
2
+ import { runHeartbeatSweep } from "../heartbeat/heartbeat";
3
+ import { route } from "./route-def";
4
+ import { json } from "./utils";
5
+
6
+ // ─── Route Definitions ───────────────────────────────────────────────────────
7
+
8
+ const triggerSweep = route({
9
+ method: "post",
10
+ path: "/api/heartbeat/sweep",
11
+ pattern: ["api", "heartbeat", "sweep"],
12
+ summary: "Trigger an immediate heartbeat sweep",
13
+ tags: ["Heartbeat"],
14
+ responses: {
15
+ 200: { description: "Sweep completed successfully" },
16
+ 401: { description: "Unauthorized" },
17
+ },
18
+ auth: { apiKey: true },
19
+ });
20
+
21
+ // ─── Handler ─────────────────────────────────────────────────────────────────
22
+
23
+ export async function handleHeartbeat(
24
+ req: IncomingMessage,
25
+ res: ServerResponse,
26
+ pathSegments: string[],
27
+ ): Promise<boolean> {
28
+ if (triggerSweep.match(req.method, pathSegments)) {
29
+ const parsed = await triggerSweep.parse(req, res, pathSegments, new URLSearchParams());
30
+ if (!parsed) return true;
31
+
32
+ try {
33
+ await runHeartbeatSweep();
34
+ json(res, { success: true, message: "Heartbeat sweep completed" });
35
+ } catch (err) {
36
+ const message = err instanceof Error ? err.message : "Unknown error during heartbeat sweep";
37
+ json(res, { success: false, error: message }, 500);
38
+ }
39
+ return true;
40
+ }
41
+
42
+ return false;
43
+ }
package/src/http/index.ts CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  type Server,
5
5
  type ServerResponse,
6
6
  } from "node:http";
7
- import { assert, initialize } from "@desplega.ai/business-use";
7
+ import { ensure, initialize } from "@desplega.ai/business-use";
8
8
  import type { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
9
9
  import { getEnabledCapabilities, hasCapability } from "@/server";
10
10
  import { initAgentMail } from "../agentmail";
@@ -25,6 +25,7 @@ import { handleDbQuery } from "./db-query";
25
25
  import { handleEcosystem } from "./ecosystem";
26
26
  import { handleEpics } from "./epics";
27
27
  import { handleEvents } from "./events";
28
+ import { handleHeartbeat } from "./heartbeat";
28
29
  import { handleMcp } from "./mcp";
29
30
  import { handleMcpServers } from "./mcp-servers";
30
31
  import { handleMemory } from "./memory";
@@ -120,6 +121,7 @@ const httpServer = createHttpServer(async (req, res) => {
120
121
  () => handleSkills(req, res, pathSegments, queryParams, myAgentId),
121
122
  () => handleMcpServers(req, res, pathSegments, queryParams),
122
123
  () => handleMemory(req, res, pathSegments, myAgentId),
124
+ () => handleHeartbeat(req, res, pathSegments),
123
125
  () => handleEvents(req, res, pathSegments, queryParams, myAgentId),
124
126
  () => handleMcp(req, res, transports),
125
127
  ];
@@ -186,7 +188,7 @@ httpServer
186
188
  .listen(port, async () => {
187
189
  console.log(`MCP HTTP server running on http://localhost:${port}/mcp`);
188
190
 
189
- assert({
191
+ ensure({
190
192
  id: "listen",
191
193
  flow: "api",
192
194
  runId: globalState.__runId!,
package/src/http/poll.ts CHANGED
@@ -141,6 +141,9 @@ export async function handlePoll(
141
141
  previousStatus: pendingTask.status,
142
142
  },
143
143
  validator: (data) => data.previousStatus === "pending",
144
+ // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
145
+ filter: ({}, ctx) => ctx.deps.length > 0,
146
+ conditions: [{ timeout_ms: 300_000 }], // 5 min: polling interval + queue wait
144
147
  });
145
148
 
146
149
  return {