npm - @desplega.ai/agent-swarm - Versions diffs - 1.53.0 → 1.54.1 - Mend

@desplega.ai/agent-swarm 1.53.0 → 1.54.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

package/README.md +2 -0
package/openapi.json +22 -1
package/package.json +1 -1
package/plugin/commands/work-on-task.md +11 -5
package/plugin/pi-skills/work-on-task/SKILL.md +11 -5
package/src/be/db.ts +44 -6
package/src/be/migrations/024_add_was_paused.sql +1 -0
package/src/commands/runner.ts +46 -1
package/src/heartbeat/heartbeat.ts +107 -11
package/src/http/agents.ts +3 -0
package/src/http/heartbeat.ts +43 -0
package/src/http/index.ts +4 -2
package/src/http/poll.ts +3 -0
package/src/http/tasks.ts +27 -4
package/src/linear/sync.ts +38 -11
package/src/linear/templates.ts +17 -0
package/src/providers/pi-mono-adapter.ts +25 -0
package/src/scheduler/scheduler.ts +1 -0
package/src/tests/context-snapshot.test.ts +127 -0
package/src/tests/events-db.test.ts +0 -1
package/src/tests/events-http.test.ts +10 -4
package/src/tests/heartbeat.test.ts +148 -6
package/src/tests/linear-webhook.test.ts +105 -4
package/src/tests/workflow-hitl-routing.test.ts +545 -0
package/src/tools/store-progress.ts +8 -2
package/src/types.ts +3 -0
package/src/workflows/engine.ts +59 -18
package/src/workflows/recovery.ts +4 -4
package/src/workflows/resume.ts +21 -15

package/README.md CHANGED Viewed

@@ -58,6 +58,8 @@ Agent Swarm lets you run a team of AI coding agents that coordinate autonomously
 - **Onboarding wizard** — Interactive CLI wizard (`agent-swarm onboard`) to set up a new swarm from scratch with presets, credential collection, and docker-compose generation
 - **Skill system** — Reusable procedural knowledge: create, install, publish, and sync skills from GitHub with scope resolution (agent → swarm → global)
 - **Human-in-the-Loop** — Workflow nodes that pause for human approval or input, with a dashboard UI for reviewing and responding to requests
+- **MCP server management** — Register, install, and manage MCP servers for agents with scope cascade (agent → swarm → global) and auto-injection into worker containers
+- **Context usage tracking** — Monitor context window utilization and compaction events per task with visual indicators in the dashboard
 ## Quick Start

package/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "Agent Swarm API",
-    "version": "1.52.1",
+    "version": "1.53.1",
     "description": "Multi-agent orchestration API for Claude Code, Codex, and Gemini CLI. Enables task distribution, agent communication, and service discovery.\n\nMCP tools are documented separately in [MCP.md](./MCP.md)."
   },
   "servers": [
@@ -2424,6 +2424,27 @@
         }
       }
     },
+    "/api/heartbeat/sweep": {
+      "post": {
+        "summary": "Trigger an immediate heartbeat sweep",
+        "tags": [
+          "Heartbeat"
+        ],
+        "security": [
+          {
+            "bearerAuth": []
+          }
+        ],
+        "responses": {
+          "200": {
+            "description": "Sweep completed successfully"
+          },
+          "401": {
+            "description": "Unauthorized"
+          }
+        }
+      }
+    },
     "/api/memory/index": {
       "post": {
         "summary": "Ingest content into memory system (async embedding)",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@desplega.ai/agent-swarm",
-  "version": "1.53.0",
+  "version": "1.54.1",
   "description": "Multi-agent orchestration for Claude Code, Codex, Gemini CLI, and other AI coding assistants",
   "license": "MIT",
   "author": "desplega.sh <contact@desplega.sh>",

package/plugin/commands/work-on-task.md CHANGED Viewed

@@ -21,11 +21,17 @@ Once you have the task details, you should:
    - Use `memory-get` on any highly relevant results to get full details
    - This step is NOT optional. Past learnings compound your effectiveness.
 <!-- /claude-only -->
-2. Figure out if you need to use any of the available commands to help you with your work (see below for available commands)
-2. Use the `/todos` command to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
-3. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/swarm-chat` command to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
-4. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
-5. Once you either done or in a dead-end, see the "Completion" section below.
+2. **Check Installed Skills (REQUIRED):** Before researching or implementing, review your "Installed Skills" section in the system prompt:
+   - If any skill's description or trigger matches this task, invoke it via the `Skill` tool BEFORE doing manual research
+   - Skills contain pre-built, tested procedures that save context window and cost
+   - Example: task involves Linear → use `linear-interaction` skill, task involves email → use `agentmail-sending` skill
+   - Only proceed to manual research/web search if NO installed skill covers the task
+   - This step is NOT optional. Skipping it wastes context and money.
+3. Figure out if you need to use any of the available commands to help you with your work (see below for available commands)
+4. Use the `/todos` command to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
+5. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/swarm-chat` command to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
+6. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
+7. Once you either done or in a dead-end, see the "Completion" section below.
 ### Available commands

package/plugin/pi-skills/work-on-task/SKILL.md CHANGED Viewed

@@ -13,11 +13,17 @@ Once you get a task assigned, you need to immediately start working on it. To do
 Once you have the task details, you should:
-1. Figure out if you need to perform any research or planning before starting (see below)
-2. Use the `/skill:todos` to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
-3. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/skill:swarm-chat` to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
-4. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
-5. Once you either done or in a dead-end, see the "Completion" section below.
+1. **Check Installed Skills (REQUIRED):** Before researching or implementing, review your "Installed Skills" section in the system prompt:
+   - If any skill's description or trigger matches this task, invoke it via the `Skill` tool BEFORE doing manual research
+   - Skills contain pre-built, tested procedures that save context window and cost
+   - Example: task involves Linear → use `linear-interaction` skill, task involves email → use `agentmail-sending` skill
+   - Only proceed to manual research/web search if NO installed skill covers the task
+   - This step is NOT optional. Skipping it wastes context and money.
+2. Figure out if you need to perform any research or planning before starting (see below)
+3. Use the `/skill:todos` to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
+4. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/skill:swarm-chat` to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
+5. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
+6. Once you either done or in a dead-end, see the "Completion" section below.
 ### Research and Planning

package/src/be/db.ts CHANGED Viewed

@@ -728,6 +728,7 @@ type AgentTaskRow = {
   peakContextPercent: number | null;
   totalContextTokensUsed: number | null;
   contextWindowSize: number | null;
+  was_paused: number;
 };
 function rowToAgentTask(row: AgentTaskRow): AgentTask {
@@ -781,6 +782,7 @@ function rowToAgentTask(row: AgentTaskRow): AgentTask {
     failureReason: row.failureReason ?? undefined,
     output: row.output ?? undefined,
     progress: row.progress ?? undefined,
+    wasPaused: !!row.was_paused,
   };
 }
@@ -1509,6 +1511,7 @@ export function pauseTask(id: string): AgentTask | null {
     .prepare<AgentTaskRow, [string]>(
       `UPDATE agent_tasks
        SET status = 'paused',
+           was_paused = 1,
            lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
        WHERE id = ? AND status = 'in_progress'
        RETURNING *`,
@@ -1543,6 +1546,7 @@ export function resumeTask(taskId: string): AgentTask | null {
     .prepare<AgentTaskRow, [string]>(
       `UPDATE agent_tasks
        SET status = 'in_progress',
+           was_paused = 1,
            lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
        WHERE id = ? AND status = 'paused'
        RETURNING *`,
@@ -5599,6 +5603,18 @@ export function updateActiveSessionProviderSessionId(
   return result.changes > 0;
 }
+/**
+ * Get the active session for a specific task.
+ * Used by the heartbeat to cross-reference stalled tasks with worker sessions.
+ */
+export function getActiveSessionForTask(taskId: string): ActiveSession | null {
+  return (
+    getDb()
+      .prepare<ActiveSession, [string]>("SELECT * FROM active_sessions WHERE taskId = ? LIMIT 1")
+      .get(taskId) ?? null
+  );
+}
 /**
  * Reassociate session logs from a runner session to a real task ID.
  * Used when a pool task is claimed — logs were stored under a random UUID,
@@ -6222,6 +6238,24 @@ export function getStepByIdempotencyKey(key: string): WorkflowRunStep | null {
   return row ? rowToWorkflowRunStep(row) : null;
 }
+export function getStepCountForNode(runId: string, nodeId: string): number {
+  const row = getDb()
+    .prepare<{ cnt: number }, [string, string]>(
+      "SELECT COUNT(*) as cnt FROM workflow_run_steps WHERE runId = ? AND nodeId = ?",
+    )
+    .get(runId, nodeId);
+  return row?.cnt ?? 0;
+}
+export function getLatestStepForNode(runId: string, nodeId: string): WorkflowRunStep | null {
+  const row = getDb()
+    .prepare<WorkflowRunStepRow, [string, string]>(
+      "SELECT * FROM workflow_run_steps WHERE runId = ? AND nodeId = ? ORDER BY startedAt DESC LIMIT 1",
+    )
+    .get(runId, nodeId);
+  return row ? rowToWorkflowRunStep(row) : null;
+}
 // --- Workflow Version History ---
 type WorkflowVersionRow = {
@@ -7855,6 +7889,13 @@ export function createContextSnapshot(input: CreateContextSnapshotInput): Contex
       .run(input.contextPercent, input.taskId);
   }
+  // Keep totalContextTokensUsed up to date with the latest known value
+  if (input.contextUsedTokens != null) {
+    getDb()
+      .prepare("UPDATE agent_tasks SET totalContextTokensUsed = ? WHERE id = ?")
+      .run(input.contextUsedTokens, input.taskId);
+  }
   if (input.eventType === "compaction") {
     getDb()
       .prepare(
@@ -7863,13 +7904,10 @@ export function createContextSnapshot(input: CreateContextSnapshotInput): Contex
       .run(input.taskId);
   }
-  if (input.eventType === "completion") {
+  if (input.eventType === "completion" && input.contextTotalTokens != null) {
     getDb()
-      .prepare(
-        `UPDATE agent_tasks SET totalContextTokensUsed = ?, contextWindowSize = ?
-         WHERE id = ?`,
-      )
-      .run(input.contextUsedTokens ?? null, input.contextTotalTokens ?? null, input.taskId);
+      .prepare("UPDATE agent_tasks SET contextWindowSize = ? WHERE id = ?")
+      .run(input.contextTotalTokens, input.taskId);
   }
   return {

package/src/be/migrations/024_add_was_paused.sql ADDED Viewed

@@ -0,0 +1 @@
+ALTER TABLE agent_tasks ADD COLUMN was_paused INTEGER NOT NULL DEFAULT 0;

package/src/commands/runner.ts CHANGED Viewed

@@ -305,6 +305,12 @@ export function humanizeToolName(name: string): string {
 export function toolCallToProgress(toolName: string, args: unknown): string | null {
   if (SKIP_PROGRESS_TOOLS.has(toolName)) return null;
+  // Normalize: pi-mono uses lowercase ("read"), Claude uses PascalCase ("Read")
+  const normalized =
+    toolName.startsWith("mcp__") || toolName.includes("_")
+      ? toolName
+      : toolName.charAt(0).toUpperCase() + toolName.slice(1);
   const a = args as Record<string, unknown>;
   const shortPath = (p: unknown) => {
     if (typeof p !== "string") return "";
@@ -313,7 +319,7 @@ export function toolCallToProgress(toolName: string, args: unknown): string | nu
     return parts.length > 2 ? parts.slice(-2).join("/") : p;
   };
-  switch (toolName) {
+  switch (normalized) {
     case "Read":
       return `📖 Reading ${shortPath(a.file_path)}`;
     case "Edit":
@@ -1067,6 +1073,25 @@ async function cleanupActiveSessions(config: ApiConfig): Promise<void> {
   }
 }
+/** Trigger a heartbeat sweep via the API (lead startup self-check) */
+async function triggerHeartbeatSweep(config: ApiConfig): Promise<boolean> {
+  try {
+    const headers: Record<string, string> = {
+      "Content-Type": "application/json",
+      "X-Agent-ID": config.agentId,
+    };
+    if (config.apiKey) headers.Authorization = `Bearer ${config.apiKey}`;
+    const resp = await fetch(`${config.apiUrl}/api/heartbeat/sweep`, {
+      method: "POST",
+      headers,
+    });
+    return resp.ok;
+  } catch (err) {
+    console.warn(`[runner] Failed to trigger heartbeat sweep: ${(err as Error).message}`);
+    return false;
+  }
+}
 /** Trigger types returned by the poll API */
 interface Trigger {
   type:
@@ -1991,6 +2016,9 @@ async function checkCompletedProcesses(
           failureReason,
         },
         validator: (data) => data.exitCode === 0,
+        // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
+        filter: ({}, ctx) => ctx.deps.length > 0,
+        conditions: [{ timeout_ms: 3_600_000 }], // 1 hour: process runtime
       });
       // Commit channel activity cursors after successful processing
@@ -2703,6 +2731,17 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
     }
     // ========== END: Resume paused tasks ==========
+    // ========== Lead startup self-check ==========
+    if (isLead) {
+      console.log(`[${role}] Running startup heartbeat sweep...`);
+      const swept = await triggerHeartbeatSweep(apiConfig);
+      if (swept) {
+        console.log(`[${role}] Startup heartbeat sweep completed`);
+      } else {
+        console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
+      }
+    }
     // Track last finished task check for leads (to avoid re-processing)
     while (true) {
       // Ping server on each iteration to keep status updated
@@ -2790,6 +2829,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
                 triggerType: trigger.type,
                 role,
               },
+              // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
+              filter: ({}, ctx) => ctx.deps.length > 0,
+              conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
             });
           }
@@ -3020,6 +3062,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
               role,
               model: taskModel,
             },
+            // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
+            filter: ({}, ctx) => ctx.deps.length > 0,
+            conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
           });
           // Attach trigger metadata for logging

package/src/heartbeat/heartbeat.ts CHANGED Viewed

@@ -2,6 +2,9 @@ import {
   claimTask,
   cleanupStaleSessions,
   createTaskExtended,
+  deleteActiveSession,
+  failTask,
+  getActiveSessionForTask,
   getActiveTaskCount,
   getAllAgents,
   getDb,
@@ -29,15 +32,25 @@ import "./templates";
 /** Default heartbeat interval: 90 seconds */
 const DEFAULT_INTERVAL_MS = Number(process.env.HEARTBEAT_INTERVAL_MS) || 90_000;
-/** Stall threshold: tasks in_progress with no update for this many minutes */
+/** Stall threshold: tasks with fresh worker heartbeat but no task update for this many minutes */
 const STALL_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALL_THRESHOLD_MIN) || 30;
+/** Stall threshold: tasks with no active session (worker clearly dead) */
+const STALL_THRESHOLD_NO_SESSION_MIN = Number(process.env.HEARTBEAT_STALL_NO_SESSION_MIN) || 5;
+/** Stall threshold: tasks with stale worker heartbeat */
+const STALL_THRESHOLD_STALE_HEARTBEAT_MIN = Number(process.env.HEARTBEAT_STALL_STALE_HB_MIN) || 15;
 /** Stale resource cleanup threshold (minutes) */
 const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEANUP_MIN) || 30;
 /** Max pool tasks to auto-assign per sweep */
 const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
+/** Escalation cooldown: minimum time between escalations for the same task set (ms) */
+const ESCALATION_COOLDOWN_MS =
+  Number(process.env.HEARTBEAT_ESCALATION_COOLDOWN_MS) || 15 * 60 * 1000;
 const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
 // ============================================================================
@@ -46,6 +59,7 @@ const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
 export interface HeartbeatFindings {
   stalledTasks: AgentTask[];
+  autoFailedTasks: Array<{ taskId: string; agentId: string; reason: string }>;
   workerHealthFixes: Array<{ agentId: string; oldStatus: string; newStatus: string }>;
   autoAssigned: Array<{ taskId: string; agentId: string }>;
   staleCleanup: {
@@ -66,6 +80,9 @@ export interface HeartbeatFindings {
 let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
 let isSweeping = false;
+/** Tracks last escalation time per escalation key to prevent spam */
+const lastEscalationTime: Map<string, number> = new Map();
 // ============================================================================
 // Tier 1: Preflight Gate
 // ============================================================================
@@ -106,6 +123,7 @@ export function preflightGate(): boolean {
 export async function codeLevelTriage(): Promise<HeartbeatFindings> {
   const findings: HeartbeatFindings = {
     stalledTasks: [],
+    autoFailedTasks: [],
     workerHealthFixes: [],
     autoAssigned: [],
     staleCleanup: {
@@ -118,8 +136,8 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
     escalationNeeded: false,
   };
-  // 1. Detect stalled tasks
-  detectStalledTasks(findings);
+  // 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers, escalate ambiguous)
+  detectAndRemediateStalledTasks(findings);
   // 2. Check and fix worker health
   checkWorkerHealth(findings);
@@ -137,11 +155,72 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
 }
 /**
- * Detect in_progress tasks that haven't been updated in a while.
+ * Tiered stall detection and auto-remediation.
+ *
+ * Cross-checks stalled tasks with active_sessions to determine severity:
+ * - No active session → worker is dead → auto-fail (5 min threshold)
+ * - Stale session heartbeat → worker likely crashed → auto-fail (15 min threshold)
+ * - Fresh session heartbeat → worker alive but task stale → escalate to lead (30 min threshold)
  */
-function detectStalledTasks(findings: HeartbeatFindings): void {
-  const stalled = getStalledInProgressTasks(STALL_THRESHOLD_MINUTES);
-  findings.stalledTasks = stalled;
+function detectAndRemediateStalledTasks(findings: HeartbeatFindings): void {
+  // Use the shortest threshold to catch all potentially stalled tasks
+  const candidates = getStalledInProgressTasks(STALL_THRESHOLD_NO_SESSION_MIN);
+  for (const task of candidates) {
+    if (!task.agentId) continue; // Unassigned tasks can't be stalled
+    const session = getActiveSessionForTask(task.id);
+    const taskAgeMs = Date.now() - new Date(task.lastUpdatedAt).getTime();
+    if (!session) {
+      // Case A: No active session — worker is dead
+      if (taskAgeMs >= STALL_THRESHOLD_NO_SESSION_MIN * 60 * 1000) {
+        const reason =
+          "Auto-failed by heartbeat: worker session not found (no active session for task)";
+        const failed = failTask(task.id, reason);
+        if (failed) {
+          findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
+          console.log(`[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — no active session`);
+          // Fix agent status if no other active tasks
+          const remaining = getActiveTaskCount(task.agentId);
+          if (remaining === 0) {
+            updateAgentStatus(task.agentId, "idle");
+          }
+        }
+      }
+    } else {
+      const sessionHeartbeatAgeMs = Date.now() - new Date(session.lastHeartbeatAt).getTime();
+      const isStaleHeartbeat =
+        sessionHeartbeatAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000;
+      if (isStaleHeartbeat) {
+        // Case B: Session exists but heartbeat is stale — worker likely crashed
+        if (taskAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000) {
+          const reason =
+            "Auto-failed by heartbeat: worker session heartbeat is stale (likely crashed)";
+          const failed = failTask(task.id, reason);
+          if (failed) {
+            findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
+            deleteActiveSession(task.id);
+            console.log(
+              `[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — stale session heartbeat`,
+            );
+            const remaining = getActiveTaskCount(task.agentId);
+            if (remaining === 0) {
+              updateAgentStatus(task.agentId, "idle");
+            }
+          }
+        }
+      } else {
+        // Case C: Session exists and heartbeat is fresh — ambiguous
+        if (taskAgeMs >= STALL_THRESHOLD_MINUTES * 60 * 1000) {
+          findings.stalledTasks.push(task);
+        }
+      }
+    }
+  }
 }
 /**
@@ -232,15 +311,13 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
 /**
  * Evaluate whether findings require escalation to a Claude session (lead agent).
- * Only escalate for truly ambiguous situations that need human-level reasoning.
+ * Only escalate for ambiguous stalls (worker alive but task not updating).
  */
 function evaluateEscalation(findings: HeartbeatFindings): void {
-  // Stalled tasks are ambiguous — the task might be actively worked on
-  // but the worker just hasn't called store-progress recently
   if (findings.stalledTasks.length > 0) {
     findings.escalationNeeded = true;
     const taskIds = findings.stalledTasks.map((t) => t.id.slice(0, 8)).join(", ");
-    findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled (no update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
+    findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled with active worker (no task update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
   }
 }
@@ -255,6 +332,13 @@ function escalateToLead(findings: HeartbeatFindings): void {
   }
   const escalationKey = buildEscalationKey(findings);
+  // Cooldown check — prevent repeated escalations for the same task set
+  const lastTime = lastEscalationTime.get(escalationKey);
+  if (lastTime && Date.now() - lastTime < ESCALATION_COOLDOWN_MS) {
+    return;
+  }
   if (hasActiveEscalationTask(lead.id, escalationKey)) {
     return;
   }
@@ -294,6 +378,7 @@ function escalateToLead(findings: HeartbeatFindings): void {
     priority: 70,
   });
+  lastEscalationTime.set(escalationKey, Date.now());
   console.log(`[Heartbeat] Created triage task for lead ${lead.name}`);
 }
@@ -337,6 +422,7 @@ export async function runHeartbeatSweep(): Promise<void> {
     if (!preflightGate()) {
       const cleanupOnlyFindings: HeartbeatFindings = {
         stalledTasks: [],
+        autoFailedTasks: [],
         workerHealthFixes: [],
         autoAssigned: [],
         staleCleanup: {
@@ -374,6 +460,9 @@ export async function runHeartbeatSweep(): Promise<void> {
 function logFindings(findings: HeartbeatFindings): void {
   const parts: string[] = [];
+  if (findings.autoFailedTasks.length > 0) {
+    parts.push(`auto_failed=${findings.autoFailedTasks.length}`);
+  }
   if (findings.stalledTasks.length > 0) {
     parts.push(`stalled=${findings.stalledTasks.length}`);
   }
@@ -432,3 +521,10 @@ export function stopHeartbeat(): void {
     console.log("[Heartbeat] Stopped");
   }
 }
+/**
+ * Reset escalation cooldown state. Exported for testing only.
+ */
+export function resetEscalationCooldowns(): void {
+  lastEscalationTime.clear();
+}

package/src/http/agents.ts CHANGED Viewed

@@ -205,6 +205,9 @@ export async function handleAgentRegister(
           // Validates that registered happened before reconnected
           return ctx.deps.length > 0;
         },
+        // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
+        filter: ({}, ctx) => ctx.deps.length > 0,
+        conditions: [{ timeout_ms: 86_400_000 }], // 1 day: agents may be offline for extended periods
       });
     }

package/src/http/heartbeat.ts ADDED Viewed

@@ -0,0 +1,43 @@
+import type { IncomingMessage, ServerResponse } from "node:http";
+import { runHeartbeatSweep } from "../heartbeat/heartbeat";
+import { route } from "./route-def";
+import { json } from "./utils";
+// ─── Route Definitions ───────────────────────────────────────────────────────
+const triggerSweep = route({
+  method: "post",
+  path: "/api/heartbeat/sweep",
+  pattern: ["api", "heartbeat", "sweep"],
+  summary: "Trigger an immediate heartbeat sweep",
+  tags: ["Heartbeat"],
+  responses: {
+    200: { description: "Sweep completed successfully" },
+    401: { description: "Unauthorized" },
+  },
+  auth: { apiKey: true },
+});
+// ─── Handler ─────────────────────────────────────────────────────────────────
+export async function handleHeartbeat(
+  req: IncomingMessage,
+  res: ServerResponse,
+  pathSegments: string[],
+): Promise<boolean> {
+  if (triggerSweep.match(req.method, pathSegments)) {
+    const parsed = await triggerSweep.parse(req, res, pathSegments, new URLSearchParams());
+    if (!parsed) return true;
+    try {
+      await runHeartbeatSweep();
+      json(res, { success: true, message: "Heartbeat sweep completed" });
+    } catch (err) {
+      const message = err instanceof Error ? err.message : "Unknown error during heartbeat sweep";
+      json(res, { success: false, error: message }, 500);
+    }
+    return true;
+  }
+  return false;
+}

package/src/http/index.ts CHANGED Viewed

@@ -4,7 +4,7 @@ import {
   type Server,
   type ServerResponse,
 } from "node:http";
-import { assert, initialize } from "@desplega.ai/business-use";
+import { ensure, initialize } from "@desplega.ai/business-use";
 import type { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
 import { getEnabledCapabilities, hasCapability } from "@/server";
 import { initAgentMail } from "../agentmail";
@@ -25,6 +25,7 @@ import { handleDbQuery } from "./db-query";
 import { handleEcosystem } from "./ecosystem";
 import { handleEpics } from "./epics";
 import { handleEvents } from "./events";
+import { handleHeartbeat } from "./heartbeat";
 import { handleMcp } from "./mcp";
 import { handleMcpServers } from "./mcp-servers";
 import { handleMemory } from "./memory";
@@ -120,6 +121,7 @@ const httpServer = createHttpServer(async (req, res) => {
     () => handleSkills(req, res, pathSegments, queryParams, myAgentId),
     () => handleMcpServers(req, res, pathSegments, queryParams),
     () => handleMemory(req, res, pathSegments, myAgentId),
+    () => handleHeartbeat(req, res, pathSegments),
     () => handleEvents(req, res, pathSegments, queryParams, myAgentId),
     () => handleMcp(req, res, transports),
   ];
@@ -186,7 +188,7 @@ httpServer
   .listen(port, async () => {
     console.log(`MCP HTTP server running on http://localhost:${port}/mcp`);
-    assert({
+    ensure({
       id: "listen",
       flow: "api",
       runId: globalState.__runId!,

package/src/http/poll.ts CHANGED Viewed

@@ -141,6 +141,9 @@ export async function handlePoll(
                 previousStatus: pendingTask.status,
               },
               validator: (data) => data.previousStatus === "pending",
+              // biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
+              filter: ({}, ctx) => ctx.deps.length > 0,
+              conditions: [{ timeout_ms: 300_000 }], // 5 min: polling interval + queue wait
             });
             return {