@desplega.ai/agent-swarm 1.55.0 → 1.56.2

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@desplega.ai/agent-swarm",
3
- "version": "1.55.0",
3
+ "version": "1.56.2",
4
4
  "description": "Multi-agent orchestration for Claude Code, Codex, Gemini CLI, and other AI coding assistants",
5
5
  "license": "MIT",
6
6
  "author": "desplega.sh <contact@desplega.sh>",
package/src/be/db.ts CHANGED
@@ -229,6 +229,7 @@ const VERSIONABLE_FIELDS: VersionableField[] = [
229
229
  "toolsMd",
230
230
  "claudeMd",
231
231
  "setupScript",
232
+ "heartbeatMd",
232
233
  ];
233
234
 
234
235
  function ensureAgentProfileColumns(database: Database): void {
@@ -399,9 +400,10 @@ function seedContextVersions(): void {
399
400
  toolsMd: string | null;
400
401
  claudeMd: string | null;
401
402
  setupScript: string | null;
403
+ heartbeatMd: string | null;
402
404
  },
403
405
  []
404
- >(`SELECT id, soulMd, identityMd, toolsMd, claudeMd, setupScript FROM agents`)
406
+ >(`SELECT id, soulMd, identityMd, toolsMd, claudeMd, setupScript, heartbeatMd FROM agents`)
405
407
  .all();
406
408
 
407
409
  for (const agent of agents) {
@@ -450,6 +452,7 @@ type AgentRow = {
450
452
  identityMd: string | null;
451
453
  setupScript: string | null;
452
454
  toolsMd: string | null;
455
+ heartbeatMd: string | null;
453
456
  lastActivityAt: string | null;
454
457
  createdAt: string;
455
458
  lastUpdatedAt: string;
@@ -471,6 +474,7 @@ function rowToAgent(row: AgentRow): Agent {
471
474
  identityMd: row.identityMd ?? undefined,
472
475
  setupScript: row.setupScript ?? undefined,
473
476
  toolsMd: row.toolsMd ?? undefined,
477
+ heartbeatMd: row.heartbeatMd ?? undefined,
474
478
  lastActivityAt: row.lastActivityAt ?? undefined,
475
479
  createdAt: row.createdAt,
476
480
  lastUpdatedAt: row.lastUpdatedAt,
@@ -2259,6 +2263,7 @@ export function updateAgentProfile(
2259
2263
  identityMd?: string;
2260
2264
  setupScript?: string;
2261
2265
  toolsMd?: string;
2266
+ heartbeatMd?: string;
2262
2267
  },
2263
2268
  meta?: VersionMeta,
2264
2269
  ): Agent | null {
@@ -2312,6 +2317,7 @@ export function updateAgentProfile(
2312
2317
  string | null,
2313
2318
  string | null,
2314
2319
  string | null,
2320
+ string | null,
2315
2321
  string,
2316
2322
  string,
2317
2323
  ]
@@ -2325,6 +2331,7 @@ export function updateAgentProfile(
2325
2331
  identityMd = COALESCE(?, identityMd),
2326
2332
  setupScript = COALESCE(?, setupScript),
2327
2333
  toolsMd = COALESCE(?, toolsMd),
2334
+ heartbeatMd = COALESCE(?, heartbeatMd),
2328
2335
  lastUpdatedAt = ?
2329
2336
  WHERE id = ? RETURNING *`,
2330
2337
  )
@@ -2337,6 +2344,7 @@ export function updateAgentProfile(
2337
2344
  updates.identityMd ?? null,
2338
2345
  updates.setupScript ?? null,
2339
2346
  updates.toolsMd ?? null,
2347
+ updates.heartbeatMd ?? null,
2340
2348
  now,
2341
2349
  id,
2342
2350
  );
@@ -1,5 +1,3 @@
1
- PRAGMA defer_foreign_keys = ON;
2
-
3
1
  -- Add 'cancelled' as a valid status for workflow runs.
4
2
  -- SQLite does not support ALTER CHECK constraints, so we recreate the table.
5
3
 
@@ -0,0 +1 @@
1
+ ALTER TABLE agents ADD COLUMN heartbeatMd TEXT DEFAULT NULL;
@@ -152,6 +152,12 @@ export function runMigrations(db: Database): void {
152
152
  }
153
153
 
154
154
  // 5. Run pending migrations
155
+ // Disable FK checks for the entire migration pass. Individual migrations
156
+ // (008, 025, 026) need to DROP + recreate tables, which can violate FKs
157
+ // mid-transaction. PRAGMA foreign_keys cannot be changed inside a transaction,
158
+ // so we disable it here, outside any transaction, and re-enable after.
159
+ db.run("PRAGMA foreign_keys = OFF");
160
+
155
161
  for (const migration of migrations) {
156
162
  const existing = applied.get(migration.version);
157
163
 
@@ -184,4 +190,6 @@ export function runMigrations(db: Database): void {
184
190
  const elapsed = (performance.now() - start).toFixed(1);
185
191
  console.debug(`[migrations] Applied: ${migration.name} (${elapsed}ms)`);
186
192
  }
193
+
194
+ db.run("PRAGMA foreign_keys = ON");
187
195
  }
@@ -1991,6 +1991,7 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
1991
1991
  let agentSetupScript: string | undefined;
1992
1992
  let agentToolsMd: string | undefined;
1993
1993
  let agentClaudeMd: string | undefined;
1994
+ let agentHeartbeatMd: string | undefined;
1994
1995
  let agentProfileName: string | undefined;
1995
1996
  let agentDescription: string | undefined;
1996
1997
  let agentSkillsSummary: { name: string; description: string }[] | undefined;
@@ -2170,6 +2171,7 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2170
2171
  claudeMd?: string;
2171
2172
  setupScript?: string;
2172
2173
  toolsMd?: string;
2174
+ heartbeatMd?: string;
2173
2175
  name?: string;
2174
2176
  description?: string;
2175
2177
  };
@@ -2178,12 +2180,19 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2178
2180
  agentSetupScript = profile.setupScript;
2179
2181
  agentToolsMd = profile.toolsMd;
2180
2182
  agentClaudeMd = profile.claudeMd;
2183
+ agentHeartbeatMd = profile.heartbeatMd;
2181
2184
  agentProfileName = profile.name;
2182
2185
  agentDescription = profile.description;
2183
2186
 
2184
2187
  // Generate default templates if missing (runner registers via POST /api/agents
2185
2188
  // which doesn't generate templates like join-swarm does)
2186
- if (!agentSoulMd || !agentIdentityMd || !agentToolsMd || !agentClaudeMd) {
2189
+ if (
2190
+ !agentSoulMd ||
2191
+ !agentIdentityMd ||
2192
+ !agentToolsMd ||
2193
+ !agentClaudeMd ||
2194
+ !agentHeartbeatMd
2195
+ ) {
2187
2196
  // Use already-fetched template (from pre-registration step)
2188
2197
  if (cachedTemplate) {
2189
2198
  const ctx = {
@@ -2202,6 +2211,8 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2202
2211
  agentClaudeMd = interpolate(cachedTemplate.files.claudeMd, ctx).result;
2203
2212
  if (!agentSetupScript)
2204
2213
  agentSetupScript = interpolate(cachedTemplate.files.setupScript, ctx).result;
2214
+ if (!agentHeartbeatMd)
2215
+ agentHeartbeatMd = interpolate(cachedTemplate.files.heartbeatMd, ctx).result;
2205
2216
  console.log(`[${role}] Applied template: ${templateId}`);
2206
2217
  }
2207
2218
 
@@ -2226,6 +2237,8 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2226
2237
  if (!profile.claudeMd && agentClaudeMd) profileUpdate.claudeMd = agentClaudeMd;
2227
2238
  if (!profile.setupScript && agentSetupScript)
2228
2239
  profileUpdate.setupScript = agentSetupScript;
2240
+ if (!profile.heartbeatMd && agentHeartbeatMd)
2241
+ profileUpdate.heartbeatMd = agentHeartbeatMd;
2229
2242
 
2230
2243
  await fetch(`${apiUrl}/api/agents/${agentId}/profile`, {
2231
2244
  method: "PUT",
@@ -2364,6 +2377,16 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
2364
2377
  }
2365
2378
  }
2366
2379
 
2380
+ // Write HEARTBEAT.md to workspace (lead's periodic checklist)
2381
+ if (agentHeartbeatMd) {
2382
+ try {
2383
+ await Bun.write("/workspace/HEARTBEAT.md", agentHeartbeatMd);
2384
+ console.log(`[${role}] Wrote HEARTBEAT.md to workspace`);
2385
+ } catch (err) {
2386
+ console.warn(`[${role}] Could not write HEARTBEAT.md: ${(err as Error).message}`);
2387
+ }
2388
+ }
2389
+
2367
2390
  // Write CLAUDE.md to workspace (agent-level instructions)
2368
2391
  if (agentClaudeMd) {
2369
2392
  try {
@@ -47,11 +47,12 @@ const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEAN
47
47
  /** Max pool tasks to auto-assign per sweep */
48
48
  const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
49
49
 
50
- /** Escalation cooldown: minimum time between escalations for the same task set (ms) */
51
- const ESCALATION_COOLDOWN_MS =
52
- Number(process.env.HEARTBEAT_ESCALATION_COOLDOWN_MS) || 15 * 60 * 1000;
50
+ /** Heartbeat checklist interval: how often to check HEARTBEAT.md (default: 30 min) */
51
+ const HEARTBEAT_CHECKLIST_INTERVAL_MS =
52
+ Number(process.env.HEARTBEAT_CHECKLIST_INTERVAL_MS) || 30 * 60 * 1000;
53
53
 
54
- const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
54
+ /** Whether to disable the heartbeat checklist entirely. Any non-empty value of HEARTBEAT_CHECKLIST_DISABLE (including "false" or "0") disables it. */
55
+ const HEARTBEAT_CHECKLIST_DISABLE = Boolean(process.env.HEARTBEAT_CHECKLIST_DISABLE);
55
56
 
56
57
  // ============================================================================
57
58
  // Types
@@ -69,8 +70,6 @@ export interface HeartbeatFindings {
69
70
  inboxProcessing: number;
70
71
  workflowRuns: number;
71
72
  };
72
- escalationNeeded: boolean;
73
- escalationReason?: string;
74
73
  }
75
74
 
76
75
  // ============================================================================
@@ -78,11 +77,9 @@ export interface HeartbeatFindings {
78
77
  // ============================================================================
79
78
 
80
79
  let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
80
+ let checklistInterval: ReturnType<typeof setInterval> | null = null;
81
81
  let isSweeping = false;
82
82
 
83
- /** Tracks last escalation time per escalation key to prevent spam */
84
- const lastEscalationTime: Map<string, number> = new Map();
85
-
86
83
  // ============================================================================
87
84
  // Tier 1: Preflight Gate
88
85
  // ============================================================================
@@ -133,10 +130,9 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
133
130
  inboxProcessing: 0,
134
131
  workflowRuns: 0,
135
132
  },
136
- escalationNeeded: false,
137
133
  };
138
134
 
139
- // 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers, escalate ambiguous)
135
+ // 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers)
140
136
  detectAndRemediateStalledTasks(findings);
141
137
 
142
138
  // 2. Check and fix worker health
@@ -148,9 +144,6 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
148
144
  // 4. Cleanup stale resources (including workflow run recovery)
149
145
  await cleanupStaleResources(findings);
150
146
 
151
- // 5. Determine if escalation is needed
152
- evaluateEscalation(findings);
153
-
154
147
  return findings;
155
148
  }
156
149
 
@@ -306,102 +299,150 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
306
299
  }
307
300
 
308
301
  // ============================================================================
309
- // Tier 3: Escalation
302
+ // Heartbeat Checklist (HEARTBEAT.md-based periodic check)
310
303
  // ============================================================================
311
304
 
312
305
  /**
313
- * Evaluate whether findings require escalation to a Claude session (lead agent).
314
- * Only escalate for ambiguous stalls (worker alive but task not updating).
306
+ * Check if content is effectively empty (only headers, comments, empty items).
307
+ * Returns true if there are no actionable items, in which case the checklist should be skipped.
315
308
  */
316
- function evaluateEscalation(findings: HeartbeatFindings): void {
317
- if (findings.stalledTasks.length > 0) {
318
- findings.escalationNeeded = true;
319
- const taskIds = findings.stalledTasks.map((t) => t.id.slice(0, 8)).join(", ");
320
- findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled with active worker (no task update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
321
- }
322
- }
309
+ export function isEffectivelyEmpty(content: string): boolean {
310
+ const lines = content.split("\n");
311
+ let inComment = false;
323
312
 
324
- /**
325
- * Create a triage task for the lead agent to investigate ambiguous findings.
326
- */
327
- function escalateToLead(findings: HeartbeatFindings): void {
328
- const lead = getLeadAgent();
329
- if (!lead) {
330
- console.log("[Heartbeat] No lead agent found — skipping escalation");
331
- return;
332
- }
313
+ for (const line of lines) {
314
+ const trimmed = line.trim();
315
+
316
+ // Track multi-line HTML comments
317
+ if (inComment) {
318
+ if (trimmed.includes("-->")) {
319
+ inComment = false;
320
+ }
321
+ continue;
322
+ }
333
323
 
334
- const escalationKey = buildEscalationKey(findings);
324
+ if (trimmed.startsWith("<!--")) {
325
+ if (!trimmed.includes("-->")) {
326
+ inComment = true;
327
+ }
328
+ continue;
329
+ }
335
330
 
336
- // Cooldown check — prevent repeated escalations for the same task set
337
- const lastTime = lastEscalationTime.get(escalationKey);
338
- if (lastTime && Date.now() - lastTime < ESCALATION_COOLDOWN_MS) {
339
- return;
340
- }
331
+ // Skip blank lines
332
+ if (trimmed === "") continue;
341
333
 
342
- if (hasActiveEscalationTask(lead.id, escalationKey)) {
343
- return;
334
+ // Skip markdown headers
335
+ if (/^#{1,6}\s/.test(trimmed)) continue;
336
+
337
+ // Skip empty list items (just a marker with no text)
338
+ if (/^[-*+]\s*\[\s*\]\s*$/.test(trimmed)) continue;
339
+ if (/^[-*+]\s*$/.test(trimmed)) continue;
340
+
341
+ // If we get here, there's real content
342
+ return false;
344
343
  }
345
344
 
346
- // Build stalled tasks section
347
- let stalledTasksSection = "";
348
- if (findings.stalledTasks.length > 0) {
349
- const lines: string[] = ["## Stalled Tasks"];
350
- for (const task of findings.stalledTasks) {
345
+ return true;
346
+ }
347
+
348
+ /**
349
+ * Gather current system status as a markdown string for the lead's checklist task.
350
+ */
351
+ export function gatherSystemStatus(): string {
352
+ const stats = getTaskStats();
353
+ const stalledTasks = getStalledInProgressTasks(STALL_THRESHOLD_MINUTES);
354
+ const agents = getAllAgents();
355
+ const idleWorkers = getIdleWorkersWithCapacity();
356
+ const poolTasks = getUnassignedPoolTasks(10);
357
+
358
+ const sections: string[] = [];
359
+
360
+ // Task overview
361
+ sections.push("## Task Overview [auto-generated]");
362
+ sections.push(`- In Progress: ${stats.in_progress ?? 0}`);
363
+ sections.push(`- Pending: ${stats.pending ?? 0}`);
364
+ sections.push(`- Unassigned: ${stats.unassigned ?? 0}`);
365
+ sections.push(`- Completed (24h): ${stats.completed ?? 0}`);
366
+ sections.push(`- Failed (24h): ${stats.failed ?? 0}`);
367
+
368
+ // Stalled tasks
369
+ if (stalledTasks.length > 0) {
370
+ sections.push("");
371
+ sections.push("## Stalled Tasks [auto-generated]");
372
+ for (const task of stalledTasks) {
351
373
  const agentSlice = task.agentId?.slice(0, 8) ?? "unassigned";
352
- lines.push(
353
- `- Task ${task.id.slice(0, 8)} (agent: ${agentSlice}): last updated ${task.lastUpdatedAt}`,
374
+ sections.push(
375
+ `- [${task.id.slice(0, 8)}] "${task.task.slice(0, 60)}" — assigned to ${agentSlice}, last update: ${task.lastUpdatedAt}`,
354
376
  );
355
377
  }
356
- lines.push(
357
- "\nCheck if these tasks are genuinely stuck or just working without calling store-progress. " +
358
- "If stuck, consider cancelling and reassigning.",
359
- );
360
- stalledTasksSection = lines.join("\n");
361
378
  }
362
379
 
363
- const escalationMarker = `\n${HEARTBEAT_ESCALATION_MARKER} ${escalationKey}`;
364
-
365
- const result = resolveTemplate("heartbeat.escalation.stalled", {
366
- stalled_tasks_section: stalledTasksSection,
367
- escalation_marker: escalationMarker,
368
- });
380
+ // Agent status
381
+ const idle = agents.filter((a) => a.status === "idle");
382
+ const busy = agents.filter((a) => a.status === "busy");
383
+ const offline = agents.filter((a) => a.status === "offline");
384
+ sections.push("");
385
+ sections.push("## Agent Status [auto-generated]");
386
+ sections.push(
387
+ `- Online: ${idle.length + busy.length} (${idle.length} idle, ${busy.length} busy), Offline: ${offline.length}`,
388
+ );
369
389
 
370
- if (result.skipped) {
371
- return;
390
+ // Available work
391
+ if (poolTasks.length > 0 || idleWorkers.length > 0) {
392
+ sections.push("");
393
+ sections.push("## Available Work [auto-generated]");
394
+ if (poolTasks.length > 0) {
395
+ sections.push(`- ${poolTasks.length} unassigned pool task(s) waiting`);
396
+ }
397
+ if (idleWorkers.length > 0) {
398
+ sections.push(`- ${idleWorkers.length} idle worker(s) with capacity`);
399
+ }
372
400
  }
373
401
 
374
- createTaskExtended(result.text, {
375
- agentId: lead.id,
376
- taskType: "heartbeat",
377
- tags: ["heartbeat", "triage", "auto-generated"],
378
- priority: 70,
379
- });
380
-
381
- lastEscalationTime.set(escalationKey, Date.now());
382
- console.log(`[Heartbeat] Created triage task for lead ${lead.name}`);
402
+ return sections.join("\n");
383
403
  }
384
404
 
385
- function buildEscalationKey(findings: HeartbeatFindings): string {
386
- const stalledTaskIds = findings.stalledTasks
387
- .map((task) => task.id)
388
- .sort((a, b) => a.localeCompare(b));
389
- return `stalled:${stalledTaskIds.join(",")}`;
390
- }
405
+ /**
406
+ * Check HEARTBEAT.md content and create a checklist task for the lead if needed.
407
+ */
408
+ export async function checkHeartbeatChecklist(): Promise<void> {
409
+ const lead = getLeadAgent();
410
+ if (!lead) return;
411
+
412
+ const heartbeatMd = lead.heartbeatMd;
413
+ if (!heartbeatMd) return;
391
414
 
392
- function hasActiveEscalationTask(leadAgentId: string, escalationKey: string): boolean {
415
+ if (isEffectivelyEmpty(heartbeatMd)) return;
416
+
417
+ // Dedup: skip if lead already has an active heartbeat-checklist task
393
418
  const existing = getDb()
394
- .prepare<{ id: string }, [string, string]>(
419
+ .prepare<{ id: string }, [string]>(
395
420
  `SELECT id FROM agent_tasks
396
421
  WHERE agentId = ?
397
- AND taskType = 'heartbeat'
422
+ AND taskType = 'heartbeat-checklist'
398
423
  AND status NOT IN ('completed', 'failed', 'cancelled')
399
- AND task LIKE ?
400
424
  LIMIT 1`,
401
425
  )
402
- .get(leadAgentId, `%${HEARTBEAT_ESCALATION_MARKER} ${escalationKey}%`);
426
+ .get(lead.id);
427
+ if (existing) return;
428
+
429
+ const systemStatus = gatherSystemStatus();
403
430
 
404
- return Boolean(existing);
431
+ const result = resolveTemplate("heartbeat.checklist", {
432
+ system_status: systemStatus,
433
+ heartbeat_content: heartbeatMd,
434
+ });
435
+
436
+ if (result.skipped) return;
437
+
438
+ createTaskExtended(result.text, {
439
+ agentId: lead.id,
440
+ taskType: "heartbeat-checklist",
441
+ tags: ["checklist", "auto-generated"],
442
+ priority: 60,
443
+ });
444
+
445
+ console.log(`[Heartbeat] Checklist task created for lead ${lead.name}`);
405
446
  }
406
447
 
407
448
  // ============================================================================
@@ -409,7 +450,7 @@ function hasActiveEscalationTask(leadAgentId: string, escalationKey: string): bo
409
450
  // ============================================================================
410
451
 
411
452
  /**
412
- * Run a single heartbeat sweep (Tier 1 → Tier 2 → Tier 3).
453
+ * Run a single heartbeat sweep (Tier 1 → Tier 2).
413
454
  */
414
455
  export async function runHeartbeatSweep(): Promise<void> {
415
456
  if (isSweeping) {
@@ -432,7 +473,6 @@ export async function runHeartbeatSweep(): Promise<void> {
432
473
  inboxProcessing: 0,
433
474
  workflowRuns: 0,
434
475
  },
435
- escalationNeeded: false,
436
476
  };
437
477
  await cleanupStaleResources(cleanupOnlyFindings);
438
478
  logFindings(cleanupOnlyFindings);
@@ -444,11 +484,6 @@ export async function runHeartbeatSweep(): Promise<void> {
444
484
 
445
485
  // Log findings summary
446
486
  logFindings(findings);
447
-
448
- // Tier 3: Escalate if needed
449
- if (findings.escalationNeeded) {
450
- escalateToLead(findings);
451
- }
452
487
  } finally {
453
488
  isSweeping = false;
454
489
  }
@@ -508,6 +543,9 @@ export function startHeartbeat(intervalMs = DEFAULT_INTERVAL_MS): void {
508
543
  heartbeatInterval = setInterval(() => {
509
544
  runHeartbeatSweep();
510
545
  }, intervalMs);
546
+
547
+ // Also start the checklist interval
548
+ startHeartbeatChecklist();
511
549
  }
512
550
 
513
551
  /**
@@ -520,11 +558,82 @@ export function stopHeartbeat(): void {
520
558
  isSweeping = false;
521
559
  console.log("[Heartbeat] Stopped");
522
560
  }
561
+ stopHeartbeatChecklist();
562
+ }
563
+
564
+ /**
565
+ * Create a one-off boot triage task for the lead after a server restart.
566
+ * Uses the same HEARTBEAT.md content but with reboot-specific context prepended.
567
+ */
568
+ export async function createBootTriageTask(): Promise<void> {
569
+ const lead = getLeadAgent();
570
+ if (!lead) return;
571
+
572
+ const heartbeatMd = lead.heartbeatMd ?? "";
573
+
574
+ // Dedup: skip if lead already has an active boot-triage task
575
+ const existing = getDb()
576
+ .prepare<{ id: string }, [string]>(
577
+ `SELECT id FROM agent_tasks
578
+ WHERE agentId = ?
579
+ AND taskType = 'boot-triage'
580
+ AND status NOT IN ('completed', 'failed', 'cancelled')
581
+ LIMIT 1`,
582
+ )
583
+ .get(lead.id);
584
+ if (existing) return;
585
+
586
+ const systemStatus = gatherSystemStatus();
587
+
588
+ const result = resolveTemplate("heartbeat.boot-triage", {
589
+ system_status: systemStatus,
590
+ heartbeat_content: isEffectivelyEmpty(heartbeatMd)
591
+ ? "_No standing orders configured._"
592
+ : heartbeatMd,
593
+ });
594
+
595
+ if (result.skipped) return;
596
+
597
+ createTaskExtended(result.text, {
598
+ agentId: lead.id,
599
+ taskType: "boot-triage",
600
+ tags: ["boot", "triage", "auto-generated"],
601
+ priority: 70, // Higher than regular checklist (60)
602
+ });
603
+
604
+ console.log(`[Heartbeat] Boot triage task created for lead ${lead.name}`);
605
+ }
606
+
607
+ /**
608
+ * Start the heartbeat checklist polling loop (separate from the infrastructure sweep).
609
+ */
610
+ export function startHeartbeatChecklist(intervalMs = HEARTBEAT_CHECKLIST_INTERVAL_MS): void {
611
+ if (HEARTBEAT_CHECKLIST_DISABLE) {
612
+ console.log("[Heartbeat] Checklist disabled via HEARTBEAT_CHECKLIST_DISABLE");
613
+ return;
614
+ }
615
+ if (checklistInterval) {
616
+ return; // Already running
617
+ }
618
+
619
+ console.log(`[Heartbeat] Checklist starting with ${intervalMs}ms interval`);
620
+
621
+ // Boot triage replaces the first checklist (30s delay to let system settle)
622
+ setTimeout(() => createBootTriageTask(), 30_000);
623
+
624
+ // Recurring checklist first runs one full interval after startup (setInterval does not fire immediately)
625
+ checklistInterval = setInterval(() => {
626
+ checkHeartbeatChecklist();
627
+ }, intervalMs);
523
628
  }
524
629
 
525
630
  /**
526
- * Reset escalation cooldown state. Exported for testing only.
631
+ * Stop the heartbeat checklist polling loop.
527
632
  */
528
- export function resetEscalationCooldowns(): void {
529
- lastEscalationTime.clear();
633
+ export function stopHeartbeatChecklist(): void {
634
+ if (checklistInterval) {
635
+ clearInterval(checklistInterval);
636
+ checklistInterval = null;
637
+ console.log("[Heartbeat] Checklist stopped");
638
+ }
530
639
  }
@@ -8,23 +8,78 @@
8
8
  import { registerTemplate } from "../prompts/registry";
9
9
 
10
10
  // ============================================================================
11
- // Escalation events
11
+ // Heartbeat checklist
12
12
  // ============================================================================
13
13
 
14
14
  registerTemplate({
15
- eventType: "heartbeat.escalation.stalled",
15
+ eventType: "heartbeat.checklist",
16
16
  header: "",
17
- defaultBody: `Task Type: Triage
18
- Goal: Investigate heartbeat findings that need human reasoning
17
+ defaultBody: `Task Type: Heartbeat Checklist
18
+ Goal: Review system status and your standing orders, take action if needed.
19
19
 
20
- {{stalled_tasks_section}}
21
- {{escalation_marker}}`,
20
+ ## Current System Status [auto-generated]
21
+ {{system_status}}
22
+
23
+ ## Your Standing Orders (from HEARTBEAT.md)
24
+ {{heartbeat_content}}
25
+
26
+ ## Instructions
27
+ 1. Review the system status above for anything that needs attention (stalled tasks, idle workers with available work, anomalies).
28
+ 2. Review your standing orders for any periodic checks or actions.
29
+ 3. If something needs attention — take action now using your available tools (create tasks, post to Slack, cancel stuck tasks, etc.).
30
+ 4. If everything looks healthy and no standing orders are actionable — complete this task with a brief "All clear" summary.
31
+ 5. Do NOT create another heartbeat-checklist task — the system handles scheduling.`,
32
+ variables: [
33
+ {
34
+ name: "system_status",
35
+ description: "Auto-generated markdown section with current system status",
36
+ },
37
+ {
38
+ name: "heartbeat_content",
39
+ description: "The lead agent's HEARTBEAT.md standing orders",
40
+ },
41
+ ],
42
+ category: "event",
43
+ });
44
+
45
+ // ============================================================================
46
+ // Boot triage (one-off after container restart)
47
+ // ============================================================================
48
+
49
+ registerTemplate({
50
+ eventType: "heartbeat.boot-triage",
51
+ header: "",
52
+ defaultBody: `Task Type: Boot Triage
53
+ Goal: The system just restarted — assess current state and triage based on your role.
54
+
55
+ ## Boot Event [auto-generated]
56
+ The API server has just restarted (possible pod rotation or deployment). This is a one-off triage task — not a recurring checklist. Review the current state, identify anything that needs immediate attention, and take action.
57
+
58
+ ## Current System Status [auto-generated]
59
+ {{system_status}}
60
+
61
+ ## Your Standing Orders (from HEARTBEAT.md)
62
+ {{heartbeat_content}}
63
+
64
+ ## Instructions
65
+ 1. **Acknowledge the reboot** — note that the system just restarted and any in-flight work may have been interrupted.
66
+ 2. Review the system status above. Pay special attention to:
67
+ - Tasks that were in-progress before the restart (they may have been auto-failed by the startup sweep)
68
+ - Workers that may need to re-register
69
+ - Any stalled or orphaned work
70
+ 3. Review your standing orders for any checks that are relevant post-reboot.
71
+ 4. Take action using your available tools (re-create failed tasks, notify affected parties, etc.).
72
+ 5. Complete this task with a summary of what you found and what actions you took.
73
+ 6. Do NOT create another boot-triage task — this is a one-off event.`,
22
74
  variables: [
23
75
  {
24
- name: "stalled_tasks_section",
25
- description: "Markdown section listing stalled tasks with details",
76
+ name: "system_status",
77
+ description: "Auto-generated markdown section with current system status",
78
+ },
79
+ {
80
+ name: "heartbeat_content",
81
+ description: "The lead agent's HEARTBEAT.md standing orders",
26
82
  },
27
- { name: "escalation_marker", description: "Heartbeat escalation marker with key" },
28
83
  ],
29
84
  category: "event",
30
85
  });