@desplega.ai/agent-swarm 1.53.1 → 1.54.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openapi.json +22 -1
- package/package.json +1 -1
- package/src/be/db.ts +34 -0
- package/src/be/migrations/024_add_was_paused.sql +1 -0
- package/src/commands/runner.ts +46 -1
- package/src/heartbeat/heartbeat.ts +107 -11
- package/src/http/agents.ts +3 -0
- package/src/http/heartbeat.ts +43 -0
- package/src/http/index.ts +4 -2
- package/src/http/poll.ts +3 -0
- package/src/http/tasks.ts +27 -4
- package/src/providers/pi-mono-adapter.ts +25 -0
- package/src/scheduler/scheduler.ts +1 -0
- package/src/tests/events-db.test.ts +0 -1
- package/src/tests/events-http.test.ts +10 -4
- package/src/tests/heartbeat.test.ts +148 -6
- package/src/tests/workflow-hitl-routing.test.ts +545 -0
- package/src/tools/store-progress.ts +8 -2
- package/src/types.ts +3 -0
- package/src/workflows/engine.ts +59 -18
- package/src/workflows/recovery.ts +4 -4
- package/src/workflows/resume.ts +21 -15
package/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "Agent Swarm API",
|
|
5
|
-
"version": "1.53.
|
|
5
|
+
"version": "1.53.1",
|
|
6
6
|
"description": "Multi-agent orchestration API for Claude Code, Codex, and Gemini CLI. Enables task distribution, agent communication, and service discovery.\n\nMCP tools are documented separately in [MCP.md](./MCP.md)."
|
|
7
7
|
},
|
|
8
8
|
"servers": [
|
|
@@ -2424,6 +2424,27 @@
|
|
|
2424
2424
|
}
|
|
2425
2425
|
}
|
|
2426
2426
|
},
|
|
2427
|
+
"/api/heartbeat/sweep": {
|
|
2428
|
+
"post": {
|
|
2429
|
+
"summary": "Trigger an immediate heartbeat sweep",
|
|
2430
|
+
"tags": [
|
|
2431
|
+
"Heartbeat"
|
|
2432
|
+
],
|
|
2433
|
+
"security": [
|
|
2434
|
+
{
|
|
2435
|
+
"bearerAuth": []
|
|
2436
|
+
}
|
|
2437
|
+
],
|
|
2438
|
+
"responses": {
|
|
2439
|
+
"200": {
|
|
2440
|
+
"description": "Sweep completed successfully"
|
|
2441
|
+
},
|
|
2442
|
+
"401": {
|
|
2443
|
+
"description": "Unauthorized"
|
|
2444
|
+
}
|
|
2445
|
+
}
|
|
2446
|
+
}
|
|
2447
|
+
},
|
|
2427
2448
|
"/api/memory/index": {
|
|
2428
2449
|
"post": {
|
|
2429
2450
|
"summary": "Ingest content into memory system (async embedding)",
|
package/package.json
CHANGED
package/src/be/db.ts
CHANGED
|
@@ -728,6 +728,7 @@ type AgentTaskRow = {
|
|
|
728
728
|
peakContextPercent: number | null;
|
|
729
729
|
totalContextTokensUsed: number | null;
|
|
730
730
|
contextWindowSize: number | null;
|
|
731
|
+
was_paused: number;
|
|
731
732
|
};
|
|
732
733
|
|
|
733
734
|
function rowToAgentTask(row: AgentTaskRow): AgentTask {
|
|
@@ -781,6 +782,7 @@ function rowToAgentTask(row: AgentTaskRow): AgentTask {
|
|
|
781
782
|
failureReason: row.failureReason ?? undefined,
|
|
782
783
|
output: row.output ?? undefined,
|
|
783
784
|
progress: row.progress ?? undefined,
|
|
785
|
+
wasPaused: !!row.was_paused,
|
|
784
786
|
};
|
|
785
787
|
}
|
|
786
788
|
|
|
@@ -1509,6 +1511,7 @@ export function pauseTask(id: string): AgentTask | null {
|
|
|
1509
1511
|
.prepare<AgentTaskRow, [string]>(
|
|
1510
1512
|
`UPDATE agent_tasks
|
|
1511
1513
|
SET status = 'paused',
|
|
1514
|
+
was_paused = 1,
|
|
1512
1515
|
lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
|
|
1513
1516
|
WHERE id = ? AND status = 'in_progress'
|
|
1514
1517
|
RETURNING *`,
|
|
@@ -1543,6 +1546,7 @@ export function resumeTask(taskId: string): AgentTask | null {
|
|
|
1543
1546
|
.prepare<AgentTaskRow, [string]>(
|
|
1544
1547
|
`UPDATE agent_tasks
|
|
1545
1548
|
SET status = 'in_progress',
|
|
1549
|
+
was_paused = 1,
|
|
1546
1550
|
lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
|
|
1547
1551
|
WHERE id = ? AND status = 'paused'
|
|
1548
1552
|
RETURNING *`,
|
|
@@ -5599,6 +5603,18 @@ export function updateActiveSessionProviderSessionId(
|
|
|
5599
5603
|
return result.changes > 0;
|
|
5600
5604
|
}
|
|
5601
5605
|
|
|
5606
|
+
/**
|
|
5607
|
+
* Get the active session for a specific task.
|
|
5608
|
+
* Used by the heartbeat to cross-reference stalled tasks with worker sessions.
|
|
5609
|
+
*/
|
|
5610
|
+
export function getActiveSessionForTask(taskId: string): ActiveSession | null {
|
|
5611
|
+
return (
|
|
5612
|
+
getDb()
|
|
5613
|
+
.prepare<ActiveSession, [string]>("SELECT * FROM active_sessions WHERE taskId = ? LIMIT 1")
|
|
5614
|
+
.get(taskId) ?? null
|
|
5615
|
+
);
|
|
5616
|
+
}
|
|
5617
|
+
|
|
5602
5618
|
/**
|
|
5603
5619
|
* Reassociate session logs from a runner session to a real task ID.
|
|
5604
5620
|
* Used when a pool task is claimed — logs were stored under a random UUID,
|
|
@@ -6222,6 +6238,24 @@ export function getStepByIdempotencyKey(key: string): WorkflowRunStep | null {
|
|
|
6222
6238
|
return row ? rowToWorkflowRunStep(row) : null;
|
|
6223
6239
|
}
|
|
6224
6240
|
|
|
6241
|
+
export function getStepCountForNode(runId: string, nodeId: string): number {
|
|
6242
|
+
const row = getDb()
|
|
6243
|
+
.prepare<{ cnt: number }, [string, string]>(
|
|
6244
|
+
"SELECT COUNT(*) as cnt FROM workflow_run_steps WHERE runId = ? AND nodeId = ?",
|
|
6245
|
+
)
|
|
6246
|
+
.get(runId, nodeId);
|
|
6247
|
+
return row?.cnt ?? 0;
|
|
6248
|
+
}
|
|
6249
|
+
|
|
6250
|
+
export function getLatestStepForNode(runId: string, nodeId: string): WorkflowRunStep | null {
|
|
6251
|
+
const row = getDb()
|
|
6252
|
+
.prepare<WorkflowRunStepRow, [string, string]>(
|
|
6253
|
+
"SELECT * FROM workflow_run_steps WHERE runId = ? AND nodeId = ? ORDER BY startedAt DESC LIMIT 1",
|
|
6254
|
+
)
|
|
6255
|
+
.get(runId, nodeId);
|
|
6256
|
+
return row ? rowToWorkflowRunStep(row) : null;
|
|
6257
|
+
}
|
|
6258
|
+
|
|
6225
6259
|
// --- Workflow Version History ---
|
|
6226
6260
|
|
|
6227
6261
|
type WorkflowVersionRow = {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ALTER TABLE agent_tasks ADD COLUMN was_paused INTEGER NOT NULL DEFAULT 0;
|
package/src/commands/runner.ts
CHANGED
|
@@ -305,6 +305,12 @@ export function humanizeToolName(name: string): string {
|
|
|
305
305
|
export function toolCallToProgress(toolName: string, args: unknown): string | null {
|
|
306
306
|
if (SKIP_PROGRESS_TOOLS.has(toolName)) return null;
|
|
307
307
|
|
|
308
|
+
// Normalize: pi-mono uses lowercase ("read"), Claude uses PascalCase ("Read")
|
|
309
|
+
const normalized =
|
|
310
|
+
toolName.startsWith("mcp__") || toolName.includes("_")
|
|
311
|
+
? toolName
|
|
312
|
+
: toolName.charAt(0).toUpperCase() + toolName.slice(1);
|
|
313
|
+
|
|
308
314
|
const a = args as Record<string, unknown>;
|
|
309
315
|
const shortPath = (p: unknown) => {
|
|
310
316
|
if (typeof p !== "string") return "";
|
|
@@ -313,7 +319,7 @@ export function toolCallToProgress(toolName: string, args: unknown): string | nu
|
|
|
313
319
|
return parts.length > 2 ? parts.slice(-2).join("/") : p;
|
|
314
320
|
};
|
|
315
321
|
|
|
316
|
-
switch (
|
|
322
|
+
switch (normalized) {
|
|
317
323
|
case "Read":
|
|
318
324
|
return `📖 Reading ${shortPath(a.file_path)}`;
|
|
319
325
|
case "Edit":
|
|
@@ -1067,6 +1073,25 @@ async function cleanupActiveSessions(config: ApiConfig): Promise<void> {
|
|
|
1067
1073
|
}
|
|
1068
1074
|
}
|
|
1069
1075
|
|
|
1076
|
+
/** Trigger a heartbeat sweep via the API (lead startup self-check) */
|
|
1077
|
+
async function triggerHeartbeatSweep(config: ApiConfig): Promise<boolean> {
|
|
1078
|
+
try {
|
|
1079
|
+
const headers: Record<string, string> = {
|
|
1080
|
+
"Content-Type": "application/json",
|
|
1081
|
+
"X-Agent-ID": config.agentId,
|
|
1082
|
+
};
|
|
1083
|
+
if (config.apiKey) headers.Authorization = `Bearer ${config.apiKey}`;
|
|
1084
|
+
const resp = await fetch(`${config.apiUrl}/api/heartbeat/sweep`, {
|
|
1085
|
+
method: "POST",
|
|
1086
|
+
headers,
|
|
1087
|
+
});
|
|
1088
|
+
return resp.ok;
|
|
1089
|
+
} catch (err) {
|
|
1090
|
+
console.warn(`[runner] Failed to trigger heartbeat sweep: ${(err as Error).message}`);
|
|
1091
|
+
return false;
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1070
1095
|
/** Trigger types returned by the poll API */
|
|
1071
1096
|
interface Trigger {
|
|
1072
1097
|
type:
|
|
@@ -1991,6 +2016,9 @@ async function checkCompletedProcesses(
|
|
|
1991
2016
|
failureReason,
|
|
1992
2017
|
},
|
|
1993
2018
|
validator: (data) => data.exitCode === 0,
|
|
2019
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
2020
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
2021
|
+
conditions: [{ timeout_ms: 3_600_000 }], // 1 hour: process runtime
|
|
1994
2022
|
});
|
|
1995
2023
|
|
|
1996
2024
|
// Commit channel activity cursors after successful processing
|
|
@@ -2703,6 +2731,17 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2703
2731
|
}
|
|
2704
2732
|
// ========== END: Resume paused tasks ==========
|
|
2705
2733
|
|
|
2734
|
+
// ========== Lead startup self-check ==========
|
|
2735
|
+
if (isLead) {
|
|
2736
|
+
console.log(`[${role}] Running startup heartbeat sweep...`);
|
|
2737
|
+
const swept = await triggerHeartbeatSweep(apiConfig);
|
|
2738
|
+
if (swept) {
|
|
2739
|
+
console.log(`[${role}] Startup heartbeat sweep completed`);
|
|
2740
|
+
} else {
|
|
2741
|
+
console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
|
|
2742
|
+
}
|
|
2743
|
+
}
|
|
2744
|
+
|
|
2706
2745
|
// Track last finished task check for leads (to avoid re-processing)
|
|
2707
2746
|
while (true) {
|
|
2708
2747
|
// Ping server on each iteration to keep status updated
|
|
@@ -2790,6 +2829,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2790
2829
|
triggerType: trigger.type,
|
|
2791
2830
|
role,
|
|
2792
2831
|
},
|
|
2832
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
2833
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
2834
|
+
conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
|
|
2793
2835
|
});
|
|
2794
2836
|
}
|
|
2795
2837
|
|
|
@@ -3020,6 +3062,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
3020
3062
|
role,
|
|
3021
3063
|
model: taskModel,
|
|
3022
3064
|
},
|
|
3065
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
3066
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
3067
|
+
conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
|
|
3023
3068
|
});
|
|
3024
3069
|
|
|
3025
3070
|
// Attach trigger metadata for logging
|
|
@@ -2,6 +2,9 @@ import {
|
|
|
2
2
|
claimTask,
|
|
3
3
|
cleanupStaleSessions,
|
|
4
4
|
createTaskExtended,
|
|
5
|
+
deleteActiveSession,
|
|
6
|
+
failTask,
|
|
7
|
+
getActiveSessionForTask,
|
|
5
8
|
getActiveTaskCount,
|
|
6
9
|
getAllAgents,
|
|
7
10
|
getDb,
|
|
@@ -29,15 +32,25 @@ import "./templates";
|
|
|
29
32
|
/** Default heartbeat interval: 90 seconds */
|
|
30
33
|
const DEFAULT_INTERVAL_MS = Number(process.env.HEARTBEAT_INTERVAL_MS) || 90_000;
|
|
31
34
|
|
|
32
|
-
/** Stall threshold: tasks
|
|
35
|
+
/** Stall threshold: tasks with fresh worker heartbeat but no task update for this many minutes */
|
|
33
36
|
const STALL_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALL_THRESHOLD_MIN) || 30;
|
|
34
37
|
|
|
38
|
+
/** Stall threshold: tasks with no active session (worker clearly dead) */
|
|
39
|
+
const STALL_THRESHOLD_NO_SESSION_MIN = Number(process.env.HEARTBEAT_STALL_NO_SESSION_MIN) || 5;
|
|
40
|
+
|
|
41
|
+
/** Stall threshold: tasks with stale worker heartbeat */
|
|
42
|
+
const STALL_THRESHOLD_STALE_HEARTBEAT_MIN = Number(process.env.HEARTBEAT_STALL_STALE_HB_MIN) || 15;
|
|
43
|
+
|
|
35
44
|
/** Stale resource cleanup threshold (minutes) */
|
|
36
45
|
const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEANUP_MIN) || 30;
|
|
37
46
|
|
|
38
47
|
/** Max pool tasks to auto-assign per sweep */
|
|
39
48
|
const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
|
|
40
49
|
|
|
50
|
+
/** Escalation cooldown: minimum time between escalations for the same task set (ms) */
|
|
51
|
+
const ESCALATION_COOLDOWN_MS =
|
|
52
|
+
Number(process.env.HEARTBEAT_ESCALATION_COOLDOWN_MS) || 15 * 60 * 1000;
|
|
53
|
+
|
|
41
54
|
const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
|
|
42
55
|
|
|
43
56
|
// ============================================================================
|
|
@@ -46,6 +59,7 @@ const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
|
|
|
46
59
|
|
|
47
60
|
export interface HeartbeatFindings {
|
|
48
61
|
stalledTasks: AgentTask[];
|
|
62
|
+
autoFailedTasks: Array<{ taskId: string; agentId: string; reason: string }>;
|
|
49
63
|
workerHealthFixes: Array<{ agentId: string; oldStatus: string; newStatus: string }>;
|
|
50
64
|
autoAssigned: Array<{ taskId: string; agentId: string }>;
|
|
51
65
|
staleCleanup: {
|
|
@@ -66,6 +80,9 @@ export interface HeartbeatFindings {
|
|
|
66
80
|
let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
|
|
67
81
|
let isSweeping = false;
|
|
68
82
|
|
|
83
|
+
/** Tracks last escalation time per escalation key to prevent spam */
|
|
84
|
+
const lastEscalationTime: Map<string, number> = new Map();
|
|
85
|
+
|
|
69
86
|
// ============================================================================
|
|
70
87
|
// Tier 1: Preflight Gate
|
|
71
88
|
// ============================================================================
|
|
@@ -106,6 +123,7 @@ export function preflightGate(): boolean {
|
|
|
106
123
|
export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
107
124
|
const findings: HeartbeatFindings = {
|
|
108
125
|
stalledTasks: [],
|
|
126
|
+
autoFailedTasks: [],
|
|
109
127
|
workerHealthFixes: [],
|
|
110
128
|
autoAssigned: [],
|
|
111
129
|
staleCleanup: {
|
|
@@ -118,8 +136,8 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
118
136
|
escalationNeeded: false,
|
|
119
137
|
};
|
|
120
138
|
|
|
121
|
-
// 1. Detect stalled tasks
|
|
122
|
-
|
|
139
|
+
// 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers, escalate ambiguous)
|
|
140
|
+
detectAndRemediateStalledTasks(findings);
|
|
123
141
|
|
|
124
142
|
// 2. Check and fix worker health
|
|
125
143
|
checkWorkerHealth(findings);
|
|
@@ -137,11 +155,72 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
137
155
|
}
|
|
138
156
|
|
|
139
157
|
/**
|
|
140
|
-
*
|
|
158
|
+
* Tiered stall detection and auto-remediation.
|
|
159
|
+
*
|
|
160
|
+
* Cross-checks stalled tasks with active_sessions to determine severity:
|
|
161
|
+
* - No active session → worker is dead → auto-fail (5 min threshold)
|
|
162
|
+
* - Stale session heartbeat → worker likely crashed → auto-fail (15 min threshold)
|
|
163
|
+
* - Fresh session heartbeat → worker alive but task stale → escalate to lead (30 min threshold)
|
|
141
164
|
*/
|
|
142
|
-
function
|
|
143
|
-
|
|
144
|
-
|
|
165
|
+
function detectAndRemediateStalledTasks(findings: HeartbeatFindings): void {
|
|
166
|
+
// Use the shortest threshold to catch all potentially stalled tasks
|
|
167
|
+
const candidates = getStalledInProgressTasks(STALL_THRESHOLD_NO_SESSION_MIN);
|
|
168
|
+
|
|
169
|
+
for (const task of candidates) {
|
|
170
|
+
if (!task.agentId) continue; // Unassigned tasks can't be stalled
|
|
171
|
+
|
|
172
|
+
const session = getActiveSessionForTask(task.id);
|
|
173
|
+
const taskAgeMs = Date.now() - new Date(task.lastUpdatedAt).getTime();
|
|
174
|
+
|
|
175
|
+
if (!session) {
|
|
176
|
+
// Case A: No active session — worker is dead
|
|
177
|
+
if (taskAgeMs >= STALL_THRESHOLD_NO_SESSION_MIN * 60 * 1000) {
|
|
178
|
+
const reason =
|
|
179
|
+
"Auto-failed by heartbeat: worker session not found (no active session for task)";
|
|
180
|
+
const failed = failTask(task.id, reason);
|
|
181
|
+
if (failed) {
|
|
182
|
+
findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
|
|
183
|
+
console.log(`[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — no active session`);
|
|
184
|
+
|
|
185
|
+
// Fix agent status if no other active tasks
|
|
186
|
+
const remaining = getActiveTaskCount(task.agentId);
|
|
187
|
+
if (remaining === 0) {
|
|
188
|
+
updateAgentStatus(task.agentId, "idle");
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
} else {
|
|
193
|
+
const sessionHeartbeatAgeMs = Date.now() - new Date(session.lastHeartbeatAt).getTime();
|
|
194
|
+
const isStaleHeartbeat =
|
|
195
|
+
sessionHeartbeatAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000;
|
|
196
|
+
|
|
197
|
+
if (isStaleHeartbeat) {
|
|
198
|
+
// Case B: Session exists but heartbeat is stale — worker likely crashed
|
|
199
|
+
if (taskAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000) {
|
|
200
|
+
const reason =
|
|
201
|
+
"Auto-failed by heartbeat: worker session heartbeat is stale (likely crashed)";
|
|
202
|
+
const failed = failTask(task.id, reason);
|
|
203
|
+
if (failed) {
|
|
204
|
+
findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
|
|
205
|
+
deleteActiveSession(task.id);
|
|
206
|
+
console.log(
|
|
207
|
+
`[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — stale session heartbeat`,
|
|
208
|
+
);
|
|
209
|
+
|
|
210
|
+
const remaining = getActiveTaskCount(task.agentId);
|
|
211
|
+
if (remaining === 0) {
|
|
212
|
+
updateAgentStatus(task.agentId, "idle");
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
} else {
|
|
217
|
+
// Case C: Session exists and heartbeat is fresh — ambiguous
|
|
218
|
+
if (taskAgeMs >= STALL_THRESHOLD_MINUTES * 60 * 1000) {
|
|
219
|
+
findings.stalledTasks.push(task);
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
145
224
|
}
|
|
146
225
|
|
|
147
226
|
/**
|
|
@@ -232,15 +311,13 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
|
|
|
232
311
|
|
|
233
312
|
/**
|
|
234
313
|
* Evaluate whether findings require escalation to a Claude session (lead agent).
|
|
235
|
-
* Only escalate for
|
|
314
|
+
* Only escalate for ambiguous stalls (worker alive but task not updating).
|
|
236
315
|
*/
|
|
237
316
|
function evaluateEscalation(findings: HeartbeatFindings): void {
|
|
238
|
-
// Stalled tasks are ambiguous — the task might be actively worked on
|
|
239
|
-
// but the worker just hasn't called store-progress recently
|
|
240
317
|
if (findings.stalledTasks.length > 0) {
|
|
241
318
|
findings.escalationNeeded = true;
|
|
242
319
|
const taskIds = findings.stalledTasks.map((t) => t.id.slice(0, 8)).join(", ");
|
|
243
|
-
findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled (no update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
|
|
320
|
+
findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled with active worker (no task update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
|
|
244
321
|
}
|
|
245
322
|
}
|
|
246
323
|
|
|
@@ -255,6 +332,13 @@ function escalateToLead(findings: HeartbeatFindings): void {
|
|
|
255
332
|
}
|
|
256
333
|
|
|
257
334
|
const escalationKey = buildEscalationKey(findings);
|
|
335
|
+
|
|
336
|
+
// Cooldown check — prevent repeated escalations for the same task set
|
|
337
|
+
const lastTime = lastEscalationTime.get(escalationKey);
|
|
338
|
+
if (lastTime && Date.now() - lastTime < ESCALATION_COOLDOWN_MS) {
|
|
339
|
+
return;
|
|
340
|
+
}
|
|
341
|
+
|
|
258
342
|
if (hasActiveEscalationTask(lead.id, escalationKey)) {
|
|
259
343
|
return;
|
|
260
344
|
}
|
|
@@ -294,6 +378,7 @@ function escalateToLead(findings: HeartbeatFindings): void {
|
|
|
294
378
|
priority: 70,
|
|
295
379
|
});
|
|
296
380
|
|
|
381
|
+
lastEscalationTime.set(escalationKey, Date.now());
|
|
297
382
|
console.log(`[Heartbeat] Created triage task for lead ${lead.name}`);
|
|
298
383
|
}
|
|
299
384
|
|
|
@@ -337,6 +422,7 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
337
422
|
if (!preflightGate()) {
|
|
338
423
|
const cleanupOnlyFindings: HeartbeatFindings = {
|
|
339
424
|
stalledTasks: [],
|
|
425
|
+
autoFailedTasks: [],
|
|
340
426
|
workerHealthFixes: [],
|
|
341
427
|
autoAssigned: [],
|
|
342
428
|
staleCleanup: {
|
|
@@ -374,6 +460,9 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
374
460
|
function logFindings(findings: HeartbeatFindings): void {
|
|
375
461
|
const parts: string[] = [];
|
|
376
462
|
|
|
463
|
+
if (findings.autoFailedTasks.length > 0) {
|
|
464
|
+
parts.push(`auto_failed=${findings.autoFailedTasks.length}`);
|
|
465
|
+
}
|
|
377
466
|
if (findings.stalledTasks.length > 0) {
|
|
378
467
|
parts.push(`stalled=${findings.stalledTasks.length}`);
|
|
379
468
|
}
|
|
@@ -432,3 +521,10 @@ export function stopHeartbeat(): void {
|
|
|
432
521
|
console.log("[Heartbeat] Stopped");
|
|
433
522
|
}
|
|
434
523
|
}
|
|
524
|
+
|
|
525
|
+
/**
|
|
526
|
+
* Reset escalation cooldown state. Exported for testing only.
|
|
527
|
+
*/
|
|
528
|
+
export function resetEscalationCooldowns(): void {
|
|
529
|
+
lastEscalationTime.clear();
|
|
530
|
+
}
|
package/src/http/agents.ts
CHANGED
|
@@ -205,6 +205,9 @@ export async function handleAgentRegister(
|
|
|
205
205
|
// Validates that registered happened before reconnected
|
|
206
206
|
return ctx.deps.length > 0;
|
|
207
207
|
},
|
|
208
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
209
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
210
|
+
conditions: [{ timeout_ms: 86_400_000 }], // 1 day: agents may be offline for extended periods
|
|
208
211
|
});
|
|
209
212
|
}
|
|
210
213
|
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { IncomingMessage, ServerResponse } from "node:http";
|
|
2
|
+
import { runHeartbeatSweep } from "../heartbeat/heartbeat";
|
|
3
|
+
import { route } from "./route-def";
|
|
4
|
+
import { json } from "./utils";
|
|
5
|
+
|
|
6
|
+
// ─── Route Definitions ───────────────────────────────────────────────────────
|
|
7
|
+
|
|
8
|
+
const triggerSweep = route({
|
|
9
|
+
method: "post",
|
|
10
|
+
path: "/api/heartbeat/sweep",
|
|
11
|
+
pattern: ["api", "heartbeat", "sweep"],
|
|
12
|
+
summary: "Trigger an immediate heartbeat sweep",
|
|
13
|
+
tags: ["Heartbeat"],
|
|
14
|
+
responses: {
|
|
15
|
+
200: { description: "Sweep completed successfully" },
|
|
16
|
+
401: { description: "Unauthorized" },
|
|
17
|
+
},
|
|
18
|
+
auth: { apiKey: true },
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
// ─── Handler ─────────────────────────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
export async function handleHeartbeat(
|
|
24
|
+
req: IncomingMessage,
|
|
25
|
+
res: ServerResponse,
|
|
26
|
+
pathSegments: string[],
|
|
27
|
+
): Promise<boolean> {
|
|
28
|
+
if (triggerSweep.match(req.method, pathSegments)) {
|
|
29
|
+
const parsed = await triggerSweep.parse(req, res, pathSegments, new URLSearchParams());
|
|
30
|
+
if (!parsed) return true;
|
|
31
|
+
|
|
32
|
+
try {
|
|
33
|
+
await runHeartbeatSweep();
|
|
34
|
+
json(res, { success: true, message: "Heartbeat sweep completed" });
|
|
35
|
+
} catch (err) {
|
|
36
|
+
const message = err instanceof Error ? err.message : "Unknown error during heartbeat sweep";
|
|
37
|
+
json(res, { success: false, error: message }, 500);
|
|
38
|
+
}
|
|
39
|
+
return true;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
return false;
|
|
43
|
+
}
|
package/src/http/index.ts
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
type Server,
|
|
5
5
|
type ServerResponse,
|
|
6
6
|
} from "node:http";
|
|
7
|
-
import {
|
|
7
|
+
import { ensure, initialize } from "@desplega.ai/business-use";
|
|
8
8
|
import type { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
9
9
|
import { getEnabledCapabilities, hasCapability } from "@/server";
|
|
10
10
|
import { initAgentMail } from "../agentmail";
|
|
@@ -25,6 +25,7 @@ import { handleDbQuery } from "./db-query";
|
|
|
25
25
|
import { handleEcosystem } from "./ecosystem";
|
|
26
26
|
import { handleEpics } from "./epics";
|
|
27
27
|
import { handleEvents } from "./events";
|
|
28
|
+
import { handleHeartbeat } from "./heartbeat";
|
|
28
29
|
import { handleMcp } from "./mcp";
|
|
29
30
|
import { handleMcpServers } from "./mcp-servers";
|
|
30
31
|
import { handleMemory } from "./memory";
|
|
@@ -120,6 +121,7 @@ const httpServer = createHttpServer(async (req, res) => {
|
|
|
120
121
|
() => handleSkills(req, res, pathSegments, queryParams, myAgentId),
|
|
121
122
|
() => handleMcpServers(req, res, pathSegments, queryParams),
|
|
122
123
|
() => handleMemory(req, res, pathSegments, myAgentId),
|
|
124
|
+
() => handleHeartbeat(req, res, pathSegments),
|
|
123
125
|
() => handleEvents(req, res, pathSegments, queryParams, myAgentId),
|
|
124
126
|
() => handleMcp(req, res, transports),
|
|
125
127
|
];
|
|
@@ -186,7 +188,7 @@ httpServer
|
|
|
186
188
|
.listen(port, async () => {
|
|
187
189
|
console.log(`MCP HTTP server running on http://localhost:${port}/mcp`);
|
|
188
190
|
|
|
189
|
-
|
|
191
|
+
ensure({
|
|
190
192
|
id: "listen",
|
|
191
193
|
flow: "api",
|
|
192
194
|
runId: globalState.__runId!,
|
package/src/http/poll.ts
CHANGED
|
@@ -141,6 +141,9 @@ export async function handlePoll(
|
|
|
141
141
|
previousStatus: pendingTask.status,
|
|
142
142
|
},
|
|
143
143
|
validator: (data) => data.previousStatus === "pending",
|
|
144
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
145
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
146
|
+
conditions: [{ timeout_ms: 300_000 }], // 5 min: polling interval + queue wait
|
|
144
147
|
});
|
|
145
148
|
|
|
146
149
|
return {
|
package/src/http/tasks.ts
CHANGED
|
@@ -344,20 +344,32 @@ export async function handleTasks(
|
|
|
344
344
|
reason,
|
|
345
345
|
},
|
|
346
346
|
validator: (data) => data.previousStatus === "pending",
|
|
347
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
348
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
349
|
+
conditions: [{ timeout_ms: 86_400_000 }], // 1 day: task may sit pending for a long time
|
|
347
350
|
});
|
|
348
351
|
} else {
|
|
349
352
|
ensure({
|
|
350
353
|
id: "cancelled_in_progress",
|
|
351
354
|
flow: "task",
|
|
352
355
|
runId: parsed.params.id,
|
|
353
|
-
depIds:
|
|
356
|
+
depIds:
|
|
357
|
+
task.status === "paused"
|
|
358
|
+
? ["started", "paused"]
|
|
359
|
+
: task.wasPaused
|
|
360
|
+
? ["started", "resumed"]
|
|
361
|
+
: ["started"],
|
|
354
362
|
data: {
|
|
355
363
|
taskId: parsed.params.id,
|
|
356
364
|
agentId: task.agentId,
|
|
357
365
|
previousStatus: task.status,
|
|
358
366
|
reason,
|
|
359
367
|
},
|
|
360
|
-
validator: (data) =>
|
|
368
|
+
validator: (data) =>
|
|
369
|
+
data.previousStatus === "in_progress" || data.previousStatus === "paused",
|
|
370
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
371
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
372
|
+
conditions: [{ timeout_ms: 3_600_000 }], // 1 hour: task running time
|
|
361
373
|
});
|
|
362
374
|
}
|
|
363
375
|
|
|
@@ -423,6 +435,8 @@ export async function handleTasks(
|
|
|
423
435
|
return { task, alreadyFinished: true };
|
|
424
436
|
}
|
|
425
437
|
|
|
438
|
+
const wasPaused = task.wasPaused;
|
|
439
|
+
|
|
426
440
|
let updatedTask: typeof task;
|
|
427
441
|
if (parsed.body.status === "completed") {
|
|
428
442
|
const result = completeTask(
|
|
@@ -448,7 +462,7 @@ export async function handleTasks(
|
|
|
448
462
|
updateAgentStatusFromCapacity(task.agentId);
|
|
449
463
|
}
|
|
450
464
|
|
|
451
|
-
return { task: updatedTask };
|
|
465
|
+
return { task: updatedTask, wasPaused };
|
|
452
466
|
})();
|
|
453
467
|
|
|
454
468
|
if ("error" in result && result.error) {
|
|
@@ -462,7 +476,7 @@ export async function handleTasks(
|
|
|
462
476
|
id: finishEventId,
|
|
463
477
|
flow: "task",
|
|
464
478
|
runId: parsed.params.id,
|
|
465
|
-
depIds: ["started"],
|
|
479
|
+
depIds: result.wasPaused ? ["started", "resumed"] : ["started"],
|
|
466
480
|
data: {
|
|
467
481
|
taskId: parsed.params.id,
|
|
468
482
|
agentId: myAgentId,
|
|
@@ -472,6 +486,9 @@ export async function handleTasks(
|
|
|
472
486
|
: { failureReason: parsed.body.failureReason }),
|
|
473
487
|
},
|
|
474
488
|
validator: (data) => data.previousStatus === "in_progress",
|
|
489
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
490
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
491
|
+
conditions: [{ timeout_ms: 3_600_000 }], // 1 hour: task running time
|
|
475
492
|
});
|
|
476
493
|
}
|
|
477
494
|
|
|
@@ -530,6 +547,9 @@ export async function handleTasks(
|
|
|
530
547
|
previousStatus: task.status,
|
|
531
548
|
},
|
|
532
549
|
validator: (data) => data.previousStatus === "in_progress",
|
|
550
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
551
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
552
|
+
conditions: [{ timeout_ms: 3_600_000 }], // 1 hour
|
|
533
553
|
});
|
|
534
554
|
|
|
535
555
|
json(res, { success: true, task: pausedTask });
|
|
@@ -585,6 +605,9 @@ export async function handleTasks(
|
|
|
585
605
|
previousStatus: task.status,
|
|
586
606
|
},
|
|
587
607
|
validator: (data) => data.previousStatus === "paused",
|
|
608
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
609
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
610
|
+
conditions: [{ timeout_ms: 86_400_000 }], // 1 day: tasks may stay paused for extended periods
|
|
588
611
|
});
|
|
589
612
|
|
|
590
613
|
json(res, { success: true, task: resumedTask });
|
|
@@ -207,6 +207,17 @@ class PiMonoSession implements ProviderSession {
|
|
|
207
207
|
this.lastEmittedMessage = text;
|
|
208
208
|
}
|
|
209
209
|
}
|
|
210
|
+
// Emit context_usage for dashboard tracking
|
|
211
|
+
const usage = this.agentSession.getContextUsage();
|
|
212
|
+
if (usage && usage.tokens != null) {
|
|
213
|
+
this.emit({
|
|
214
|
+
type: "context_usage",
|
|
215
|
+
contextUsedTokens: usage.tokens,
|
|
216
|
+
contextTotalTokens: usage.contextWindow,
|
|
217
|
+
contextPercent: usage.percent ?? 0,
|
|
218
|
+
outputTokens: 0,
|
|
219
|
+
});
|
|
220
|
+
}
|
|
210
221
|
break;
|
|
211
222
|
}
|
|
212
223
|
case "tool_execution_start": {
|
|
@@ -224,6 +235,13 @@ class PiMonoSession implements ProviderSession {
|
|
|
224
235
|
},
|
|
225
236
|
}),
|
|
226
237
|
});
|
|
238
|
+
// Emit normalized tool_start for runner auto-progress
|
|
239
|
+
this.emit({
|
|
240
|
+
type: "tool_start",
|
|
241
|
+
toolCallId: event.toolCallId,
|
|
242
|
+
toolName: event.toolName,
|
|
243
|
+
args: event.args,
|
|
244
|
+
});
|
|
227
245
|
break;
|
|
228
246
|
}
|
|
229
247
|
case "tool_execution_end":
|
|
@@ -244,6 +262,13 @@ class PiMonoSession implements ProviderSession {
|
|
|
244
262
|
},
|
|
245
263
|
}),
|
|
246
264
|
});
|
|
265
|
+
// Emit normalized tool_end
|
|
266
|
+
this.emit({
|
|
267
|
+
type: "tool_end",
|
|
268
|
+
toolCallId: event.toolCallId,
|
|
269
|
+
toolName: event.toolName,
|
|
270
|
+
result: event.result,
|
|
271
|
+
});
|
|
247
272
|
break;
|
|
248
273
|
case "auto_retry_start":
|
|
249
274
|
this.emit({
|
|
@@ -276,6 +276,7 @@ export function startScheduler(
|
|
|
276
276
|
const start = ctx.deps.find((d) => d.id === "listen");
|
|
277
277
|
return !!start && start.data?.capabilities?.includes("scheduling");
|
|
278
278
|
},
|
|
279
|
+
conditions: [{ timeout_ms: 10_000 }], // 10s: scheduler starts immediately after listen
|
|
279
280
|
});
|
|
280
281
|
}
|
|
281
282
|
|