@cleocode/core 2026.3.58 → 2026.3.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. package/dist/agents/agent-registry.d.ts +206 -0
  2. package/dist/agents/agent-registry.d.ts.map +1 -0
  3. package/dist/agents/agent-registry.js +288 -0
  4. package/dist/agents/agent-registry.js.map +1 -0
  5. package/dist/agents/agent-schema.js +5 -0
  6. package/dist/agents/agent-schema.js.map +1 -1
  7. package/dist/agents/execution-learning.js +474 -0
  8. package/dist/agents/execution-learning.js.map +1 -0
  9. package/dist/agents/health-monitor.d.ts +161 -0
  10. package/dist/agents/health-monitor.d.ts.map +1 -0
  11. package/dist/agents/health-monitor.js +217 -0
  12. package/dist/agents/health-monitor.js.map +1 -0
  13. package/dist/agents/index.d.ts +3 -1
  14. package/dist/agents/index.d.ts.map +1 -1
  15. package/dist/agents/index.js +9 -1
  16. package/dist/agents/index.js.map +1 -1
  17. package/dist/agents/retry.d.ts +57 -4
  18. package/dist/agents/retry.d.ts.map +1 -1
  19. package/dist/agents/retry.js +57 -4
  20. package/dist/agents/retry.js.map +1 -1
  21. package/dist/backfill/index.d.ts +27 -0
  22. package/dist/backfill/index.d.ts.map +1 -1
  23. package/dist/backfill/index.js +229 -0
  24. package/dist/backfill/index.js.map +1 -0
  25. package/dist/bootstrap.d.ts +2 -1
  26. package/dist/bootstrap.d.ts.map +1 -1
  27. package/dist/bootstrap.js +135 -28
  28. package/dist/bootstrap.js.map +1 -1
  29. package/dist/cleo.d.ts +40 -0
  30. package/dist/cleo.d.ts.map +1 -1
  31. package/dist/config.js +83 -0
  32. package/dist/config.js.map +1 -1
  33. package/dist/index.d.ts +1 -0
  34. package/dist/index.d.ts.map +1 -1
  35. package/dist/index.js +1036 -536
  36. package/dist/index.js.map +4 -4
  37. package/dist/intelligence/adaptive-validation.js +497 -0
  38. package/dist/intelligence/adaptive-validation.js.map +1 -0
  39. package/dist/intelligence/impact.d.ts +34 -1
  40. package/dist/intelligence/impact.d.ts.map +1 -1
  41. package/dist/intelligence/impact.js +176 -0
  42. package/dist/intelligence/impact.js.map +1 -1
  43. package/dist/intelligence/index.d.ts +2 -2
  44. package/dist/intelligence/index.d.ts.map +1 -1
  45. package/dist/intelligence/index.js +6 -1
  46. package/dist/intelligence/index.js.map +1 -1
  47. package/dist/intelligence/types.d.ts +60 -0
  48. package/dist/intelligence/types.d.ts.map +1 -1
  49. package/dist/internal.d.ts +5 -4
  50. package/dist/internal.d.ts.map +1 -1
  51. package/dist/internal.js +11 -2
  52. package/dist/internal.js.map +1 -1
  53. package/dist/lib/index.d.ts +10 -0
  54. package/dist/lib/index.d.ts.map +1 -0
  55. package/dist/lib/index.js +10 -0
  56. package/dist/lib/index.js.map +1 -0
  57. package/dist/lib/retry.d.ts +128 -0
  58. package/dist/lib/retry.d.ts.map +1 -0
  59. package/dist/lib/retry.js +152 -0
  60. package/dist/lib/retry.js.map +1 -0
  61. package/dist/nexus/sharing/index.d.ts +48 -2
  62. package/dist/nexus/sharing/index.d.ts.map +1 -1
  63. package/dist/nexus/sharing/index.js +110 -1
  64. package/dist/nexus/sharing/index.js.map +1 -1
  65. package/dist/scaffold.d.ts.map +1 -1
  66. package/dist/scaffold.js +22 -2
  67. package/dist/scaffold.js.map +1 -1
  68. package/dist/sessions/session-enforcement.js +4 -0
  69. package/dist/sessions/session-enforcement.js.map +1 -1
  70. package/dist/stats/index.js +2 -0
  71. package/dist/stats/index.js.map +1 -1
  72. package/dist/stats/workflow-telemetry.d.ts +15 -0
  73. package/dist/stats/workflow-telemetry.d.ts.map +1 -1
  74. package/dist/stats/workflow-telemetry.js +400 -0
  75. package/dist/stats/workflow-telemetry.js.map +1 -0
  76. package/dist/store/brain-schema.js +4 -1
  77. package/dist/store/brain-schema.js.map +1 -1
  78. package/dist/store/converters.js +2 -0
  79. package/dist/store/converters.js.map +1 -1
  80. package/dist/store/cross-db-cleanup.d.ts +35 -0
  81. package/dist/store/cross-db-cleanup.d.ts.map +1 -1
  82. package/dist/store/cross-db-cleanup.js +169 -0
  83. package/dist/store/cross-db-cleanup.js.map +1 -0
  84. package/dist/store/db-helpers.js +2 -0
  85. package/dist/store/db-helpers.js.map +1 -1
  86. package/dist/store/migration-sqlite.js +5 -0
  87. package/dist/store/migration-sqlite.js.map +1 -1
  88. package/dist/store/sqlite-data-accessor.js +20 -28
  89. package/dist/store/sqlite-data-accessor.js.map +1 -1
  90. package/dist/store/sqlite.js +13 -2
  91. package/dist/store/sqlite.js.map +1 -1
  92. package/dist/store/task-store.js +4 -0
  93. package/dist/store/task-store.js.map +1 -1
  94. package/dist/store/tasks-schema.js +50 -20
  95. package/dist/store/tasks-schema.js.map +1 -1
  96. package/dist/tasks/add.js +87 -3
  97. package/dist/tasks/add.js.map +1 -1
  98. package/dist/tasks/complete.d.ts.map +1 -1
  99. package/dist/tasks/complete.js +15 -4
  100. package/dist/tasks/complete.js.map +1 -1
  101. package/dist/tasks/enforcement.d.ts.map +1 -1
  102. package/dist/tasks/enforcement.js +8 -1
  103. package/dist/tasks/enforcement.js.map +1 -1
  104. package/dist/tasks/epic-enforcement.d.ts +61 -0
  105. package/dist/tasks/epic-enforcement.d.ts.map +1 -1
  106. package/dist/tasks/epic-enforcement.js +294 -0
  107. package/dist/tasks/epic-enforcement.js.map +1 -0
  108. package/dist/tasks/index.js +1 -1
  109. package/dist/tasks/index.js.map +1 -1
  110. package/dist/tasks/pipeline-stage.d.ts +70 -1
  111. package/dist/tasks/pipeline-stage.d.ts.map +1 -1
  112. package/dist/tasks/pipeline-stage.js +248 -0
  113. package/dist/tasks/pipeline-stage.js.map +1 -0
  114. package/dist/tasks/update.js +28 -0
  115. package/dist/tasks/update.js.map +1 -1
  116. package/package.json +5 -5
  117. package/schemas/config.schema.json +37 -1547
  118. package/src/__tests__/sharing.test.ts +24 -0
  119. package/src/agents/__tests__/agent-registry.test.ts +351 -0
  120. package/src/agents/__tests__/health-monitor.test.ts +332 -0
  121. package/src/agents/agent-registry.ts +394 -0
  122. package/src/agents/health-monitor.ts +279 -0
  123. package/src/agents/index.ts +24 -1
  124. package/src/agents/retry.ts +57 -4
  125. package/src/backfill/index.ts +27 -0
  126. package/src/bootstrap.ts +171 -30
  127. package/src/cleo.ts +103 -2
  128. package/src/config.ts +3 -3
  129. package/src/index.ts +1 -0
  130. package/src/intelligence/__tests__/impact.test.ts +165 -1
  131. package/src/intelligence/impact.ts +203 -0
  132. package/src/intelligence/index.ts +3 -0
  133. package/src/intelligence/types.ts +76 -0
  134. package/src/internal.ts +20 -0
  135. package/src/lib/__tests__/retry.test.ts +321 -0
  136. package/src/lib/index.ts +16 -0
  137. package/src/lib/retry.ts +224 -0
  138. package/src/nexus/sharing/index.ts +142 -2
  139. package/src/scaffold.ts +24 -2
  140. package/src/stats/workflow-telemetry.ts +15 -0
  141. package/src/store/__tests__/session-store.test.ts +43 -7
  142. package/src/store/__tests__/task-store.test.ts +1 -1
  143. package/src/store/__tests__/test-db-helper.ts +7 -3
  144. package/src/store/cross-db-cleanup.ts +35 -0
  145. package/src/tasks/__tests__/epic-enforcement.test.ts +9 -4
  146. package/src/tasks/__tests__/minimal-test.test.ts +2 -2
  147. package/src/tasks/__tests__/update.test.ts +25 -25
  148. package/src/tasks/complete.ts +11 -6
  149. package/src/tasks/enforcement.ts +6 -3
  150. package/src/tasks/epic-enforcement.ts +61 -0
  151. package/src/tasks/pipeline-stage.ts +70 -1
  152. package/templates/config.template.json +5 -116
  153. package/templates/global-config.template.json +2 -44
@@ -0,0 +1,394 @@
1
+ /**
2
+ * Agent registry with capacity tracking for load balancing.
3
+ *
4
+ * Provides task-count-based capacity queries, specialization lookup,
5
+ * and performance recording on top of the existing `agent_instances` schema.
6
+ *
7
+ * Capacity model: each agent has a maximum of {@link MAX_TASKS_PER_AGENT}
8
+ * concurrent tasks. "Remaining capacity" is that constant minus the number of
9
+ * tasks currently assigned to an active agent instance.
10
+ *
11
+ * Specializations are stored as a `specializations` array inside the agent's
12
+ * `metadata_json` column. Use {@link updateAgentSpecializations} to write them.
13
+ *
14
+ * Performance recording delegates to the existing `recordAgentExecution`
15
+ * function in `execution-learning.ts` and wraps it with a simpler metrics
16
+ * interface suited for load-balancer callers.
17
+ *
18
+ * @module agents/agent-registry
19
+ * @task T041
20
+ * @epic T038
21
+ */
22
+
23
+ import { and, eq, inArray } from 'drizzle-orm';
24
+ import { getDb } from '../store/sqlite.js';
25
+ import { type AgentInstanceRow, type AgentType, agentInstances } from './agent-schema.js';
26
+ import {
27
+ type AgentExecutionEvent,
28
+ type AgentExecutionOutcome,
29
+ recordAgentExecution,
30
+ } from './execution-learning.js';
31
+ import { listAgentInstances } from './registry.js';
32
+
33
+ // ============================================================================
34
+ // Constants
35
+ // ============================================================================
36
+
/**
 * Ceiling on how many tasks a single agent may hold at the same time.
 *
 * Capacity calculations subtract an agent's active task count from this value
 * to obtain its remaining capacity.
 */
export const MAX_TASKS_PER_AGENT = 5;
42
+
43
+ // ============================================================================
44
+ // Types
45
+ // ============================================================================
46
+
/**
 * Snapshot of one agent instance's task-count-based load.
 *
 * Produced by {@link getAgentCapacity}; all counts are relative to
 * {@link MAX_TASKS_PER_AGENT}.
 */
export interface AgentCapacity {
  /** ID of the agent instance this snapshot describes. */
  agentId: string;
  /** Type classification copied from the agent row. */
  agentType: AgentType;
  /** Status of the agent row at the time of the query. */
  status: AgentInstanceRow['status'];
  /** How many tasks are counted against this agent right now. */
  activeTasks: number;
  /** Free slots left: `maxCapacity - activeTasks`, never below zero. */
  remainingCapacity: number;
  /** Slot ceiling; always {@link MAX_TASKS_PER_AGENT}. */
  maxCapacity: number;
  /** True when at least one free slot remains. */
  available: boolean;
}
66
+
/**
 * Simplified input accepted by {@link recordAgentPerformance}.
 *
 * Every field is copied onto the execution event that is handed to
 * `recordAgentExecution`.
 */
export interface AgentPerformanceMetrics {
  /** ID of the task the agent worked on. */
  taskId: string;
  /** Label describing the kind of task (e.g. "epic", "task", "subtask"). */
  taskType: string;
  /** How the agent's work on the task ended. */
  outcome: AgentExecutionOutcome;
  /** Optional labels attached to the task. */
  taskLabels?: string[];
  /** Session the agent ran under; falls back to the agent row's session when omitted. */
  sessionId?: string;
  /** Execution time in milliseconds. */
  durationMs?: number;
  /** Error text, relevant when the outcome is a failure. */
  errorMessage?: string;
  /** Error classification, relevant when the outcome is a failure. */
  errorType?: 'retriable' | 'permanent' | 'unknown';
}
88
+
89
+ // ============================================================================
90
+ // Capacity queries
91
+ // ============================================================================
92
+
/**
 * Compute the task-count-based capacity of a single agent instance.
 *
 * The active task count is the sum of:
 * - 1 if the agent's own `taskId` is set, otherwise 0; and
 * - the number of child agent rows whose `parentAgentId` equals `agentId`
 *   and whose status is `starting`, `active`, `idle`, or `error`.
 *
 * Remaining capacity is {@link MAX_TASKS_PER_AGENT} minus that count, clamped
 * at zero.
 *
 * @remarks
 * Agents whose status is `stopped` or `crashed` short-circuit: they report
 * zero active tasks, zero remaining capacity, and `available: false`, and
 * their children are not queried.
 *
 * @param agentId - Agent instance ID to inspect
 * @param cwd - Working directory forwarded to `getDb` to locate the database
 * @returns Capacity breakdown, or null when no agent row has this ID
 *
 * @example
 * ```ts
 * const cap = await getAgentCapacity('agt_20260321120000_ab12cd', '/project');
 * if (cap?.available) {
 *   console.log(`Agent can take ${cap.remainingCapacity} more tasks`);
 * }
 * ```
 */
export async function getAgentCapacity(
  agentId: string,
  cwd?: string,
): Promise<AgentCapacity | null> {
  const db = await getDb(cwd);

  const row = await db.select().from(agentInstances).where(eq(agentInstances.id, agentId)).get();
  if (!row) {
    return null;
  }

  // Shared result builder so both exit paths report identical fields.
  const snapshot = (activeTasks: number, remainingCapacity: number): AgentCapacity => ({
    agentId: row.id,
    agentType: row.agentType,
    status: row.status,
    activeTasks,
    remainingCapacity,
    maxCapacity: MAX_TASKS_PER_AGENT,
    available: remainingCapacity > 0,
  });

  // Stopped or crashed agents cannot accept work: report zero across the board.
  if (row.status === 'stopped' || row.status === 'crashed') {
    return snapshot(0, 0);
  }

  // Children are rows that name this agent as their parent and are not terminal.
  const delegated = await db
    .select({ id: agentInstances.id })
    .from(agentInstances)
    .where(
      and(
        eq(agentInstances.parentAgentId, agentId),
        inArray(agentInstances.status, ['starting', 'active', 'idle', 'error']),
      ),
    )
    .all();

  // The agent's own assignment occupies one slot when a task is attached.
  const ownSlot = row.taskId != null ? 1 : 0;
  const activeTasks = ownSlot + delegated.length;

  return snapshot(activeTasks, Math.max(0, MAX_TASKS_PER_AGENT - activeTasks));
}
173
+
/**
 * Rank `active` and `idle` agents by remaining task capacity, highest first.
 *
 * Agents are fetched through `listAgentInstances` with a status filter of
 * `active` / `idle` (plus `agentType` when given), then each one is resolved
 * through {@link getAgentCapacity}. Agents that resolve to null (no longer
 * present) are dropped from the result.
 *
 * @remarks
 * Agents in any other status — including `starting`, `error`, `stopped`, and
 * `crashed` — are never listed because they are excluded by the status filter.
 *
 * @param agentType - Optional filter restricting results to one agent type
 * @param cwd - Working directory forwarded to the underlying queries
 * @returns Capacity entries sorted by `remainingCapacity` descending
 *
 * @example
 * ```ts
 * const [best] = await getAgentsByCapacity('executor', '/project');
 * if (best?.available) {
 *   await assignTask(best.agentId, taskId);
 * }
 * ```
 */
export async function getAgentsByCapacity(
  agentType?: AgentType,
  cwd?: string,
): Promise<AgentCapacity[]> {
  const status: ('active' | 'idle')[] = ['active', 'idle'];
  const filters: Parameters<typeof listAgentInstances>[0] = agentType
    ? { status, agentType }
    : { status };

  const candidates = await listAgentInstances(filters, cwd);

  // Resolve all capacities concurrently; each lookup is independent.
  const lookups = candidates.map((candidate) => getAgentCapacity(candidate.id, cwd));
  const resolved = await Promise.all(lookups);

  const ranked: AgentCapacity[] = [];
  for (const entry of resolved) {
    if (entry !== null) {
      ranked.push(entry);
    }
  }

  ranked.sort((left, right) => right.remainingCapacity - left.remainingCapacity);
  return ranked;
}
216
+
217
+ // ============================================================================
218
+ // Specializations
219
+ // ============================================================================
220
+
/**
 * Shape of the JSON object kept in an agent row's `metadataJson` field.
 *
 * Only `specializations` is typed; every other key is carried through as
 * `unknown`.
 *
 * @internal
 */
interface AgentMetadata {
  /** Skill/specialization labels recorded for the agent, when present. */
  specializations?: string[];
  /** Any other metadata keys stored alongside the specializations. */
  [key: string]: unknown;
}
231
+
/**
 * Read the specialization labels recorded for an agent.
 *
 * The labels live under the `specializations` key of the JSON stored in the
 * agent row's `metadataJson` field. Non-string entries in that array are
 * dropped from the result.
 *
 * @remarks
 * An empty array is returned in every "nothing usable" case: the agent does
 * not exist, the metadata fails to parse, the key is missing, or the value is
 * not an array. Use {@link updateAgentSpecializations} to write the list.
 *
 * @param agentId - Agent instance ID to look up
 * @param cwd - Working directory forwarded to `getDb` to locate the database
 * @returns Specialization strings, or an empty array when none are recorded
 *
 * @example
 * ```ts
 * const skills = await getAgentSpecializations('agt_20260321120000_ab12cd', '/project');
 * if (skills.includes('typescript')) {
 *   console.log('Agent can handle TypeScript tasks');
 * }
 * ```
 */
export async function getAgentSpecializations(agentId: string, cwd?: string): Promise<string[]> {
  const db = await getDb(cwd);
  const row = await db
    .select({ metadataJson: agentInstances.metadataJson })
    .from(agentInstances)
    .where(eq(agentInstances.id, agentId))
    .get();

  if (!row) {
    return [];
  }

  try {
    // Property access stays inside the try: a stored JSON `null` makes it throw.
    const parsed = JSON.parse(row.metadataJson ?? '{}') as AgentMetadata;
    const candidate = parsed.specializations;
    return Array.isArray(candidate)
      ? candidate.filter((entry): entry is string => typeof entry === 'string')
      : [];
  } catch {
    return [];
  }
}
276
+
/**
 * Replace the specializations list stored in an agent's metadata.
 *
 * The existing `metadataJson` object is read, its `specializations` key is
 * overwritten with the supplied list, and the result is written back. All
 * other keys of the existing metadata object are preserved.
 *
 * @remarks
 * Existing metadata is only reused when it parses to a plain JSON object.
 * Unparseable text, `null`, arrays, and primitives are treated as empty
 * metadata; spreading an array or string would otherwise inject numeric-index
 * keys into the stored object. This is the write-side companion to
 * {@link getAgentSpecializations}.
 *
 * @param agentId - Agent instance ID to update
 * @param specializations - New specializations list (replaces any existing list)
 * @param cwd - Working directory forwarded to `getDb` to locate the database
 * @returns The list that was written, or null when no agent row has this ID
 *
 * @example
 * ```ts
 * await updateAgentSpecializations(
 *   'agt_20260321120000_ab12cd',
 *   ['typescript', 'testing'],
 *   '/project',
 * );
 * ```
 */
export async function updateAgentSpecializations(
  agentId: string,
  specializations: string[],
  cwd?: string,
): Promise<string[] | null> {
  const db = await getDb(cwd);
  const agent = await db
    .select({ metadataJson: agentInstances.metadataJson })
    .from(agentInstances)
    .where(eq(agentInstances.id, agentId))
    .get();

  if (!agent) return null;

  let existing: AgentMetadata = {};
  try {
    const parsed: unknown = JSON.parse(agent.metadataJson ?? '{}');
    // Only a plain object is safe to spread; arrays and strings would leak index keys.
    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
      existing = parsed as AgentMetadata;
    }
  } catch {
    // Unparseable metadata: fall through with an empty object.
  }

  const updated: AgentMetadata = { ...existing, specializations };
  await db
    .update(agentInstances)
    .set({ metadataJson: JSON.stringify(updated) })
    .where(eq(agentInstances.id, agentId));

  return specializations;
}
331
+
332
+ // ============================================================================
333
+ // Performance recording
334
+ // ============================================================================
335
+
/**
 * Record an agent's performance on a task via `recordAgentExecution`.
 *
 * Looks up the agent's type and session from its row, combines them with the
 * supplied {@link AgentPerformanceMetrics} into an {@link AgentExecutionEvent},
 * and hands the event to {@link recordAgentExecution}. Callers therefore only
 * need the agent ID, not its type.
 *
 * @remarks
 * The session ID is taken from `metrics.sessionId` first, then from the agent
 * row, and is left undefined when neither is set. This wrapper adds no error
 * handling of its own. NOTE(review): any best-effort / error-swallowing
 * behaviour presumably comes from `recordAgentExecution` — confirm there.
 *
 * @param agentId - Agent instance ID whose performance is being recorded
 * @param metrics - Performance metrics for the task that was processed
 * @param cwd - Working directory forwarded to `getDb` and `recordAgentExecution`
 * @returns The `id` of the recorded decision, or null when the agent is not
 *   found or no decision (or no id) comes back
 *
 * @example
 * ```ts
 * const decisionId = await recordAgentPerformance('agt_20260321120000_ab12cd', {
 *   taskId: 'T041',
 *   taskType: 'task',
 *   outcome: 'success',
 *   durationMs: 4200,
 *   sessionId: 'ses_20260321_abc',
 * }, '/project');
 * ```
 */
export async function recordAgentPerformance(
  agentId: string,
  metrics: AgentPerformanceMetrics,
  cwd?: string,
): Promise<string | null> {
  const db = await getDb(cwd);
  const row = await db
    .select({ agentType: agentInstances.agentType, sessionId: agentInstances.sessionId })
    .from(agentInstances)
    .where(eq(agentInstances.id, agentId))
    .get();

  if (!row) {
    return null;
  }

  const { taskId, taskType, outcome, taskLabels, durationMs, errorMessage, errorType } = metrics;

  const event: AgentExecutionEvent = {
    agentId,
    agentType: row.agentType,
    taskId,
    taskType,
    outcome,
    taskLabels,
    // Caller-supplied session wins; otherwise fall back to the agent's own.
    sessionId: metrics.sessionId ?? row.sessionId ?? undefined,
    durationMs,
    errorMessage,
    errorType,
  };

  const recorded = await recordAgentExecution(event, cwd);
  if (recorded == null) {
    return null;
  }
  return recorded.id ?? null;
}
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Agent Health Monitoring -- Heartbeat and crash detection for live agent instances.
3
+ *
4
+ * Provides the public-facing health API specified by T039:
5
+ * - `recordHeartbeat` — update last_heartbeat for a live agent
6
+ * - `checkAgentHealth` — check health of a specific agent by ID
7
+ * - `detectStaleAgents` — find agents whose heartbeat is older than threshold
8
+ * - `detectCrashedAgents` — find active agents with no heartbeat for >3 min
9
+ *
10
+ * These functions delegate to the lower-level `registry.ts` primitives
11
+ * (`heartbeat`, `listAgentInstances`, `markCrashed`) and add the
12
+ * named, task-spec-aligned surface required for T039.
13
+ *
14
+ * @module agents/health-monitor
15
+ * @task T039
16
+ * @epic T038
17
+ */
18
+
19
+ import type { AgentInstanceRow, AgentInstanceStatus } from './agent-schema.js';
20
+ import { heartbeat, listAgentInstances, markCrashed } from './registry.js';
21
+
22
+ // ============================================================================
23
+ // Constants
24
+ // ============================================================================
25
+
/** Interval at which live agents are expected to send a heartbeat: 30 seconds. */
export const HEARTBEAT_INTERVAL_MS = 30 * 1_000;

/** Default staleness threshold: three minutes without a heartbeat. */
export const STALE_THRESHOLD_MS = 3 * 60 * 1_000;

/** Agent statuses treated as "alive" when evaluating heartbeat health. */
const ALIVE_STATUSES: AgentInstanceStatus[] = ['starting', 'active', 'idle'];
34
+
35
+ // ============================================================================
36
+ // Types
37
+ // ============================================================================
38
+
/**
 * Heartbeat-based health report for one agent instance.
 *
 * Built from an agent row and a staleness threshold by the internal
 * `buildHealthStatus` helper.
 */
export interface AgentHealthStatus {
  /** ID of the agent instance the report describes. */
  agentId: string;
  /** Status stored on the agent row. */
  status: AgentInstanceStatus;
  /** The row's last heartbeat timestamp, as stored. */
  lastHeartbeat: string;
  /** Milliseconds elapsed between the last heartbeat and the moment of the check. */
  heartbeatAgeMs: number;
  /** True when the agent is in an alive status and is not stale. */
  healthy: boolean;
  /** True when the agent is in an alive status and its heartbeat age exceeds the threshold. */
  stale: boolean;
  /** Threshold (ms) the heartbeat age was compared against. */
  thresholdMs: number;
}
58
+
59
+ // ============================================================================
60
+ // recordHeartbeat
61
+ // ============================================================================
62
+
/**
 * Record a heartbeat for an agent instance.
 *
 * Thin wrapper that forwards to the registry's `heartbeat` primitive and
 * returns whatever it resolves to: an {@link AgentInstanceStatus} or `null`.
 * Long-running agents are expected to call this every
 * {@link HEARTBEAT_INTERVAL_MS} from their main loop.
 *
 * @remarks
 * NOTE(review): `heartbeat` is not visible from this module. It presumably
 * refreshes `last_heartbeat` and reports the agent's status. Whether a
 * terminal agent (`stopped` / `crashed`) yields `null` or its terminal status
 * must be confirmed in `registry.ts`; callers should treat both as a shutdown
 * signal, as the example does.
 *
 * @param agentId - The agent instance ID (e.g. `agt_20260322120000_a1b2c3`)
 * @param cwd - Working directory forwarded to `heartbeat` (optional)
 * @returns The status reported by `heartbeat`, or `null`
 *
 * @example
 * ```ts
 * // Inside the agent's main loop:
 * const heartbeatTimer = setInterval(async () => {
 *   const status = await recordHeartbeat(agentId);
 *   if (status === 'stopped' || status === null) {
 *     clearInterval(heartbeatTimer);
 *     process.exit(0);
 *   }
 * }, HEARTBEAT_INTERVAL_MS);
 * ```
 */
export async function recordHeartbeat(
  agentId: string,
  cwd?: string,
): Promise<AgentInstanceStatus | null> {
  const reportedStatus = await heartbeat(agentId, cwd);
  return reportedStatus;
}
101
+
102
+ // ============================================================================
103
+ // checkAgentHealth
104
+ // ============================================================================
105
+
/**
 * Report the heartbeat health of one agent instance.
 *
 * Fetches agent rows through `listAgentInstances`, picks the first row whose
 * `id` matches, and converts it into an {@link AgentHealthStatus} relative to
 * `thresholdMs`.
 *
 * @remarks
 * NOTE(review): the lookup calls `listAgentInstances` without a filter and
 * scans the result in memory; a direct by-ID query would avoid loading every
 * row if the registry offers one.
 *
 * @param agentId - The agent instance ID to check
 * @param thresholdMs - Staleness threshold in milliseconds (default: {@link STALE_THRESHOLD_MS})
 * @param cwd - Working directory forwarded to `listAgentInstances` (optional)
 * @returns The agent's health report, or `null` when no row has this ID
 *
 * @example
 * ```ts
 * const health = await checkAgentHealth('agt_20260322120000_a1b2c3');
 * if (health?.stale) {
 *   console.log(`Agent stale for ${health.heartbeatAgeMs}ms`);
 * }
 * ```
 */
export async function checkAgentHealth(
  agentId: string,
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentHealthStatus | null> {
  const rows = await listAgentInstances(undefined, cwd);

  for (const row of rows) {
    if (row.id === agentId) {
      return buildHealthStatus(row, thresholdMs);
    }
  }

  return null;
}
143
+
144
+ // ============================================================================
145
+ // detectStaleAgents
146
+ // ============================================================================
147
+
/**
 * List alive agents whose last heartbeat is older than `thresholdMs`.
 *
 * Only agents with status `starting`, `active`, or `idle` are fetched. Each
 * is converted to an {@link AgentHealthStatus} and kept when it is flagged
 * stale. This function only reads; it does not change any agent's status.
 *
 * @param thresholdMs - Staleness threshold in ms (default: {@link STALE_THRESHOLD_MS})
 * @param cwd - Working directory forwarded to `listAgentInstances` (optional)
 * @returns Health reports for the stale agents, ordered by heartbeat age
 *   descending (most stale first)
 *
 * @example
 * ```ts
 * for (const s of await detectStaleAgents()) {
 *   console.log(`${s.agentId} has been stale for ${s.heartbeatAgeMs / 1000}s`);
 * }
 * ```
 */
export async function detectStaleAgents(
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentHealthStatus[]> {
  const aliveRows = await listAgentInstances({ status: ALIVE_STATUSES }, cwd);

  const staleReports: AgentHealthStatus[] = [];
  for (const row of aliveRows) {
    const report = buildHealthStatus(row, thresholdMs);
    if (report.stale) {
      staleReports.push(report);
    }
  }

  staleReports.sort((first, second) => second.heartbeatAgeMs - first.heartbeatAgeMs);
  return staleReports;
}
186
+
187
+ // ============================================================================
188
+ // detectCrashedAgents
189
+ // ============================================================================
190
+
/**
 * Mark `active` agents as crashed when their heartbeat has been silent for
 * longer than `thresholdMs`.
 *
 * An agent is treated as crashed when it:
 *   1. has status `active` (other statuses are never fetched), and
 *   2. has a last heartbeat older than `thresholdMs`.
 *
 * Each such agent is passed to {@link markCrashed} together with a
 * heartbeat-timeout reason; rows returned by `markCrashed` are collected.
 *
 * @remarks
 * This function is WRITE-side: it calls `markCrashed` for every timed-out
 * agent. For a read-only view, use {@link detectStaleAgents} instead.
 *
 * Heartbeat age is computed from the parsed timestamp, the same way the
 * internal health-status helper does, rather than by comparing timestamp
 * strings. An agent whose heartbeat cannot be parsed is skipped, not marked
 * crashed.
 *
 * @param thresholdMs - Crash threshold in ms (default: {@link STALE_THRESHOLD_MS})
 * @param cwd - Working directory forwarded to `listAgentInstances` and `markCrashed` (optional)
 * @returns The rows returned by `markCrashed`, sorted by last heartbeat
 *   ascending (oldest first)
 *
 * @example
 * ```ts
 * // Inside an orchestrator health watchdog:
 * const crashed = await detectCrashedAgents();
 * if (crashed.length > 0) {
 *   logger.warn({ crashed: crashed.map(a => a.id) }, 'Agents marked crashed');
 * }
 * ```
 */
export async function detectCrashedAgents(
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentInstanceRow[]> {
  // Only agents that are explicitly 'active' are considered.
  const activeAgents = await listAgentInstances({ status: 'active' }, cwd);

  // One clock reading so every agent is judged against the same instant.
  const now = Date.now();
  const reason = `Heartbeat timeout — no heartbeat for >${Math.round(thresholdMs / 1000)}s`;

  const crashed: AgentInstanceRow[] = [];

  for (const agent of activeAgents) {
    const heartbeatAgeMs = now - new Date(agent.lastHeartbeat).getTime();
    // NaN (unparseable heartbeat) fails this comparison and is skipped.
    if (heartbeatAgeMs > thresholdMs) {
      const updated = await markCrashed(agent.id, reason, cwd);
      if (updated) {
        crashed.push(updated);
      }
    }
  }

  // Missing or unparseable heartbeats sort first, as the oldest possible value.
  const heartbeatMs = (row: AgentInstanceRow): number => {
    const parsed = new Date(row.lastHeartbeat ?? '').getTime();
    return Number.isNaN(parsed) ? Number.NEGATIVE_INFINITY : parsed;
  };

  // Sort oldest-heartbeat first (most severely stale).
  crashed.sort((a, b) => {
    const aMs = heartbeatMs(a);
    const bMs = heartbeatMs(b);
    return aMs < bMs ? -1 : aMs > bMs ? 1 : 0;
  });

  return crashed;
}
254
+
255
+ // ============================================================================
256
+ // Internal helpers
257
+ // ============================================================================
258
+
/**
 * Convert a raw agent row into an {@link AgentHealthStatus}.
 *
 * Heartbeat age is the current time minus the parsed `lastHeartbeat`. Only
 * agents in an alive status (`starting`, `active`, `idle`) can be stale or
 * healthy; for any other status both flags are false.
 *
 * @param agent - Agent row to evaluate
 * @param thresholdMs - Heartbeat age (ms) above which an alive agent is stale
 * @returns The health report for the row
 */
function buildHealthStatus(agent: AgentInstanceRow, thresholdMs: number): AgentHealthStatus {
  const status = agent.status as AgentInstanceStatus;
  const heartbeatAgeMs = Date.now() - new Date(agent.lastHeartbeat).getTime();

  const alive = ALIVE_STATUSES.includes(status);
  const stale = alive && heartbeatAgeMs > thresholdMs;
  const healthy = alive && !stale;

  return {
    agentId: agent.id,
    status,
    lastHeartbeat: agent.lastHeartbeat,
    heartbeatAgeMs,
    healthy,
    stale,
    thresholdMs,
  };
}