@cleocode/core 2026.3.57 → 2026.3.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136) hide show
  1. package/dist/agents/agent-registry.d.ts +206 -0
  2. package/dist/agents/agent-registry.d.ts.map +1 -0
  3. package/dist/agents/agent-schema.d.ts.map +1 -1
  4. package/dist/agents/execution-learning.d.ts +223 -0
  5. package/dist/agents/execution-learning.d.ts.map +1 -0
  6. package/dist/agents/health-monitor.d.ts +161 -0
  7. package/dist/agents/health-monitor.d.ts.map +1 -0
  8. package/dist/agents/index.d.ts +4 -1
  9. package/dist/agents/index.d.ts.map +1 -1
  10. package/dist/agents/retry.d.ts +57 -4
  11. package/dist/agents/retry.d.ts.map +1 -1
  12. package/dist/backfill/index.d.ts +83 -0
  13. package/dist/backfill/index.d.ts.map +1 -0
  14. package/dist/bootstrap.d.ts +1 -1
  15. package/dist/config.d.ts +47 -0
  16. package/dist/config.d.ts.map +1 -1
  17. package/dist/index.d.ts +2 -1
  18. package/dist/index.d.ts.map +1 -1
  19. package/dist/index.js +6985 -5068
  20. package/dist/index.js.map +4 -4
  21. package/dist/intelligence/adaptive-validation.d.ts +151 -0
  22. package/dist/intelligence/adaptive-validation.d.ts.map +1 -0
  23. package/dist/intelligence/impact.d.ts +34 -1
  24. package/dist/intelligence/impact.d.ts.map +1 -1
  25. package/dist/intelligence/index.d.ts +7 -2
  26. package/dist/intelligence/index.d.ts.map +1 -1
  27. package/dist/intelligence/types.d.ts +60 -0
  28. package/dist/intelligence/types.d.ts.map +1 -1
  29. package/dist/internal.d.ts +8 -4
  30. package/dist/internal.d.ts.map +1 -1
  31. package/dist/lib/index.d.ts +10 -0
  32. package/dist/lib/index.d.ts.map +1 -0
  33. package/dist/lib/retry.d.ts +128 -0
  34. package/dist/lib/retry.d.ts.map +1 -0
  35. package/dist/nexus/sharing/index.d.ts +48 -2
  36. package/dist/nexus/sharing/index.d.ts.map +1 -1
  37. package/dist/sessions/session-enforcement.d.ts.map +1 -1
  38. package/dist/stats/index.d.ts +1 -0
  39. package/dist/stats/index.d.ts.map +1 -1
  40. package/dist/stats/workflow-telemetry.d.ts +89 -0
  41. package/dist/stats/workflow-telemetry.d.ts.map +1 -0
  42. package/dist/store/brain-schema.d.ts.map +1 -1
  43. package/dist/store/converters.d.ts.map +1 -1
  44. package/dist/store/cross-db-cleanup.d.ts +93 -0
  45. package/dist/store/cross-db-cleanup.d.ts.map +1 -0
  46. package/dist/store/db-helpers.d.ts.map +1 -1
  47. package/dist/store/migration-sqlite.d.ts.map +1 -1
  48. package/dist/store/sqlite-data-accessor.d.ts.map +1 -1
  49. package/dist/store/sqlite.d.ts.map +1 -1
  50. package/dist/store/task-store.d.ts.map +1 -1
  51. package/dist/store/tasks-schema.d.ts +18 -3
  52. package/dist/store/tasks-schema.d.ts.map +1 -1
  53. package/dist/store/validation-schemas.d.ts +32 -0
  54. package/dist/store/validation-schemas.d.ts.map +1 -1
  55. package/dist/tasks/add.d.ts +10 -1
  56. package/dist/tasks/add.d.ts.map +1 -1
  57. package/dist/tasks/complete.d.ts.map +1 -1
  58. package/dist/tasks/enforcement.d.ts +22 -0
  59. package/dist/tasks/enforcement.d.ts.map +1 -0
  60. package/dist/tasks/epic-enforcement.d.ts +199 -0
  61. package/dist/tasks/epic-enforcement.d.ts.map +1 -0
  62. package/dist/tasks/index.d.ts +1 -1
  63. package/dist/tasks/index.d.ts.map +1 -1
  64. package/dist/tasks/pipeline-stage.d.ts +181 -0
  65. package/dist/tasks/pipeline-stage.d.ts.map +1 -0
  66. package/dist/tasks/update.d.ts +2 -0
  67. package/dist/tasks/update.d.ts.map +1 -1
  68. package/migrations/drizzle-brain/20260321000001_t033-brain-indexes/migration.sql +12 -0
  69. package/migrations/drizzle-brain/20260321000001_t033-brain-indexes/snapshot.json +1232 -0
  70. package/migrations/drizzle-tasks/20260321000000_t033-connection-health/migration.sql +518 -0
  71. package/migrations/drizzle-tasks/20260321000000_t033-connection-health/snapshot.json +4312 -0
  72. package/migrations/drizzle-tasks/20260321000002_t060-pipeline-stage-binding/migration.sql +82 -0
  73. package/migrations/drizzle-tasks/20260321000002_t060-pipeline-stage-binding/snapshot.json +9 -0
  74. package/package.json +5 -5
  75. package/schemas/config.schema.json +37 -1547
  76. package/src/__tests__/sharing.test.ts +24 -0
  77. package/src/agents/__tests__/agent-registry.test.ts +351 -0
  78. package/src/agents/__tests__/execution-learning.test.ts +684 -0
  79. package/src/agents/__tests__/health-monitor.test.ts +332 -0
  80. package/src/agents/__tests__/registry.test.ts +30 -2
  81. package/src/agents/agent-registry.ts +394 -0
  82. package/src/agents/agent-schema.ts +5 -0
  83. package/src/agents/execution-learning.ts +675 -0
  84. package/src/agents/health-monitor.ts +279 -0
  85. package/src/agents/index.ts +37 -1
  86. package/src/agents/retry.ts +57 -4
  87. package/src/backfill/index.ts +309 -0
  88. package/src/bootstrap.ts +1 -1
  89. package/src/config.ts +126 -0
  90. package/src/index.ts +8 -1
  91. package/src/intelligence/__tests__/adaptive-validation.test.ts +694 -0
  92. package/src/intelligence/__tests__/impact.test.ts +165 -1
  93. package/src/intelligence/adaptive-validation.ts +764 -0
  94. package/src/intelligence/impact.ts +203 -0
  95. package/src/intelligence/index.ts +19 -0
  96. package/src/intelligence/types.ts +76 -0
  97. package/src/internal.ts +39 -0
  98. package/src/lib/__tests__/retry.test.ts +321 -0
  99. package/src/lib/index.ts +16 -0
  100. package/src/lib/retry.ts +224 -0
  101. package/src/lifecycle/__tests__/chain-store.test.ts +7 -0
  102. package/src/lifecycle/__tests__/tessera-engine.test.ts +52 -0
  103. package/src/nexus/sharing/index.ts +142 -2
  104. package/src/sessions/__tests__/session-edge-cases.test.ts +24 -1
  105. package/src/sessions/session-enforcement.ts +13 -2
  106. package/src/stats/index.ts +7 -0
  107. package/src/stats/workflow-telemetry.ts +502 -0
  108. package/src/store/__tests__/migration-safety.test.ts +3 -0
  109. package/src/store/__tests__/session-store.test.ts +132 -1
  110. package/src/store/__tests__/task-store.test.ts +22 -1
  111. package/src/store/__tests__/test-db-helper.ts +29 -2
  112. package/src/store/brain-schema.ts +4 -1
  113. package/src/store/converters.ts +2 -0
  114. package/src/store/cross-db-cleanup.ts +192 -0
  115. package/src/store/db-helpers.ts +2 -0
  116. package/src/store/migration-sqlite.ts +6 -0
  117. package/src/store/sqlite-data-accessor.ts +20 -28
  118. package/src/store/sqlite.ts +14 -2
  119. package/src/store/task-store.ts +6 -0
  120. package/src/store/tasks-schema.ts +59 -20
  121. package/src/tasks/__tests__/add.test.ts +16 -0
  122. package/src/tasks/__tests__/complete-unblocks.test.ts +10 -1
  123. package/src/tasks/__tests__/complete.test.ts +11 -2
  124. package/src/tasks/__tests__/epic-enforcement.test.ts +909 -0
  125. package/src/tasks/__tests__/minimal-test.test.ts +28 -0
  126. package/src/tasks/__tests__/pipeline-stage.test.ts +403 -0
  127. package/src/tasks/__tests__/update.test.ts +40 -6
  128. package/src/tasks/add.ts +128 -2
  129. package/src/tasks/complete.ts +29 -17
  130. package/src/tasks/enforcement.ts +127 -0
  131. package/src/tasks/epic-enforcement.ts +364 -0
  132. package/src/tasks/index.ts +1 -0
  133. package/src/tasks/pipeline-stage.ts +293 -0
  134. package/src/tasks/update.ts +62 -0
  135. package/templates/config.template.json +34 -111
  136. package/templates/global-config.template.json +24 -40
@@ -0,0 +1,279 @@
1
+ /**
2
+ * Agent Health Monitoring -- Heartbeat and crash detection for live agent instances.
3
+ *
4
+ * Provides the public-facing health API specified by T039:
5
+ * - `recordHeartbeat` — update last_heartbeat for a live agent
6
+ * - `checkAgentHealth` — check health of a specific agent by ID
7
+ * - `detectStaleAgents` — find agents whose heartbeat is older than threshold
8
+ * - `detectCrashedAgents` — find active agents with no heartbeat for >3 min and mark them `crashed`
9
+ *
10
+ * These functions delegate to the lower-level `registry.ts` primitives
11
+ * (`heartbeat`, `listAgentInstances`, `markCrashed`) and add the
12
+ * named, task-spec-aligned surface required for T039.
13
+ *
14
+ * @module agents/health-monitor
15
+ * @task T039
16
+ * @epic T038
17
+ */
18
+
19
+ import type { AgentInstanceRow, AgentInstanceStatus } from './agent-schema.js';
20
+ import { heartbeat, listAgentInstances, markCrashed } from './registry.js';
21
+
22
+ // ============================================================================
23
+ // Constants
24
+ // ============================================================================
25
+
/** Cadence at which agents are expected to report liveness: 30 seconds (per BRAIN spec). */
export const HEARTBEAT_INTERVAL_MS = 30_000;

/** Default staleness threshold: three minutes without a heartbeat. */
export const STALE_THRESHOLD_MS = 60_000 * 3;

/** Statuses this module treats as "alive" when evaluating agent health. */
const ALIVE_STATUSES: Array<AgentInstanceStatus> = ['starting', 'active', 'idle'];
34
+
35
+ // ============================================================================
36
+ // Types
37
+ // ============================================================================
38
+
/**
 * Point-in-time health report for a single agent instance.
 *
 * All derived fields (`heartbeatAgeMs`, `healthy`, `stale`) are computed at
 * the moment the report is built and are relative to `thresholdMs`.
 */
export interface AgentHealthStatus {
  /** ID of the agent instance this report describes. */
  agentId: string;
  /** Status of the agent as read from its database row. */
  status: AgentInstanceStatus;
  /** Timestamp of the most recent heartbeat, as stored on the agent row. */
  lastHeartbeat: string;
  /** Elapsed milliseconds between the last heartbeat and report creation. */
  heartbeatAgeMs: number;
  /** `true` when the agent is in an alive status and is not stale. */
  healthy: boolean;
  /** `true` when an alive agent's heartbeat age exceeds `thresholdMs`. */
  stale: boolean;
  /** The staleness threshold (ms) this report was evaluated against. */
  thresholdMs: number;
}
58
+
59
+ // ============================================================================
60
+ // recordHeartbeat
61
+ // ============================================================================
62
+
/**
 * Signal liveness for an agent instance.
 *
 * Thin wrapper over the registry-level `heartbeat` primitive: the arguments
 * are forwarded unchanged and the result is handed straight back. Long-running
 * agents are expected to call this once every {@link HEARTBEAT_INTERVAL_MS}
 * (30 s) from their main loop.
 *
 * @param agentId - The agent instance ID (e.g. `agt_20260322120000_a1b2c3`)
 * @param cwd - Optional working directory, forwarded to `heartbeat`
 *   (presumably used to locate tasks.db — confirm in `registry.ts`)
 * @returns Whatever `heartbeat` resolves to: the agent's
 *   {@link AgentInstanceStatus}, or `null`
 *
 * @remarks
 * NOTE(review): whether a terminal agent (`stopped` / `crashed`) yields `null`
 * or its existing status is decided by `heartbeat` in `registry.ts`, which is
 * not visible from this module — confirm there. Callers that need to react to
 * an external shutdown should handle both outcomes, as the example does.
 *
 * @example
 * ```ts
 * // Inside the agent's main loop:
 * const heartbeatTimer = setInterval(async () => {
 *   const status = await recordHeartbeat(agentId);
 *   if (status === 'stopped' || status === null) {
 *     // Orchestrator shut us down — exit cleanly
 *     clearInterval(heartbeatTimer);
 *     process.exit(0);
 *   }
 * }, HEARTBEAT_INTERVAL_MS);
 * ```
 */
export async function recordHeartbeat(
  agentId: string,
  cwd?: string,
): Promise<AgentInstanceStatus | null> {
  const currentStatus = await heartbeat(agentId, cwd);
  return currentStatus;
}
101
+
102
+ // ============================================================================
103
+ // checkAgentHealth
104
+ // ============================================================================
105
+
/**
 * Look up one agent instance by ID and report its health.
 *
 * Fetches the instance list via `listAgentInstances` (no filter argument is
 * supplied), picks the first row whose `id` equals `agentId`, and converts it
 * into an {@link AgentHealthStatus} evaluated against `thresholdMs`.
 *
 * @param agentId - The agent instance ID to check
 * @param thresholdMs - Staleness threshold in milliseconds
 *   (default: {@link STALE_THRESHOLD_MS}, 3 minutes)
 * @param cwd - Optional working directory, forwarded to `listAgentInstances`
 * @returns The agent's {@link AgentHealthStatus}, or `null` when no listed
 *   instance has the given ID
 *
 * @remarks
 * The lookup scans the full list returned by `listAgentInstances` rather than
 * querying a single row.
 *
 * @example
 * ```ts
 * const health = await checkAgentHealth('agt_20260322120000_a1b2c3');
 * if (health && health.stale) {
 *   console.log(`Agent stale for ${health.heartbeatAgeMs}ms — presumed crashed`);
 * }
 * ```
 */
export async function checkAgentHealth(
  agentId: string,
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentHealthStatus | null> {
  const instances = await listAgentInstances(undefined, cwd);

  for (const instance of instances) {
    if (instance.id === agentId) {
      return buildHealthStatus(instance, thresholdMs);
    }
  }

  // No listed instance carries this ID.
  return null;
}
143
+
144
+ // ============================================================================
145
+ // detectStaleAgents
146
+ // ============================================================================
147
+
/**
 * List every alive agent whose heartbeat is older than `thresholdMs`.
 *
 * Only agents in an alive status (`starting`, `active`, `idle`) are queried;
 * each one is turned into an {@link AgentHealthStatus} and kept when that
 * report is flagged `stale`. Agents in other statuses are never requested
 * from the registry, so they cannot appear in the result.
 *
 * This function only reads agent rows and computes reports; it does not call
 * any registry function that marks agents. Use it as the non-mutating
 * counterpart to crash detection.
 *
 * @param thresholdMs - Staleness threshold in ms
 *   (default: {@link STALE_THRESHOLD_MS}, 3 minutes / 180 000 ms)
 * @param cwd - Optional working directory, forwarded to `listAgentInstances`
 * @returns Reports for each stale agent, ordered by heartbeat age descending
 *   (most-stale first)
 *
 * @example
 * ```ts
 * const stale = await detectStaleAgents();
 * for (const s of stale) {
 *   console.log(`${s.agentId} has been stale for ${s.heartbeatAgeMs / 1000}s`);
 * }
 * ```
 */
export async function detectStaleAgents(
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentHealthStatus[]> {
  const aliveAgents = await listAgentInstances({ status: ALIVE_STATUSES }, cwd);

  const staleReports: AgentHealthStatus[] = [];
  for (const agent of aliveAgents) {
    const report = buildHealthStatus(agent, thresholdMs);
    if (report.stale) {
      staleReports.push(report);
    }
  }

  // Largest heartbeat age first.
  staleReports.sort((left, right) => right.heartbeatAgeMs - left.heartbeatAgeMs);
  return staleReports;
}
186
+
187
+ // ============================================================================
188
+ // detectCrashedAgents
189
+ // ============================================================================
190
+
/**
 * Mark silent `active` agents as crashed and return the affected rows.
 *
 * An agent is treated as crashed when both hold:
 *   1. Its status is `active` (only `active` rows are requested, so `idle`,
 *      `starting`, `stopped` and `crashed` agents are never considered)
 *   2. Its `lastHeartbeat` sorts before the cutoff `now - thresholdMs`
 *
 * Every match is passed to {@link markCrashed} together with a
 * "Heartbeat timeout" reason; rows are collected only when `markCrashed`
 * returns a truthy value.
 *
 * @param thresholdMs - Crash threshold in ms
 *   (default: {@link STALE_THRESHOLD_MS}, 3 minutes / 180 000 ms)
 * @param cwd - Optional working directory, forwarded to the registry calls
 * @returns The rows returned by `markCrashed`, ordered by `lastHeartbeat`
 *   ascending (oldest first)
 *
 * @remarks
 * This function is WRITE-side: it calls `markCrashed` for each match, one
 * agent at a time. Run it on a schedule (e.g. every 60 s) from an orchestrator
 * or health watchdog. For a view that marks nothing, use
 * {@link detectStaleAgents}.
 *
 * NOTE(review): the cutoff check compares timestamps as strings, which is
 * only chronological if `lastHeartbeat` is stored in the same ISO-8601 UTC
 * format that `Date.prototype.toISOString()` produces — confirm against the
 * code that writes the column.
 *
 * @example
 * ```ts
 * // Inside an orchestrator health watchdog:
 * const crashed = await detectCrashedAgents();
 * if (crashed.length > 0) {
 *   logger.warn({ crashed: crashed.map(a => a.id) }, 'Agents marked crashed');
 * }
 * ```
 */
export async function detectCrashedAgents(
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentInstanceRow[]> {
  // Idle and starting agents are deliberately left out: they may not have
  // settled into a regular heartbeat rhythm yet.
  const candidates = await listAgentInstances({ status: 'active' }, cwd);
  const cutoffIso = new Date(Date.now() - thresholdMs).toISOString();
  const reason = `Heartbeat timeout — no heartbeat for >${Math.round(thresholdMs / 1000)}s`;

  const newlyCrashed: AgentInstanceRow[] = [];
  for (const candidate of candidates) {
    // Lexicographic comparison of timestamp strings (see NOTE above).
    if (!(candidate.lastHeartbeat < cutoffIso)) continue;

    const crashedRow = await markCrashed(candidate.id, reason, cwd);
    if (crashedRow) {
      newlyCrashed.push(crashedRow);
    }
  }

  // Oldest heartbeat first, i.e. the longest-silent agent leads the list.
  newlyCrashed.sort((left, right) => {
    const leftHeartbeat = left.lastHeartbeat ?? '';
    const rightHeartbeat = right.lastHeartbeat ?? '';
    if (leftHeartbeat < rightHeartbeat) return -1;
    if (leftHeartbeat > rightHeartbeat) return 1;
    return 0;
  });

  return newlyCrashed;
}
254
+
255
+ // ============================================================================
256
+ // Internal helpers
257
+ // ============================================================================
258
+
/**
 * Build an {@link AgentHealthStatus} from a raw agent row.
 *
 * Rules applied:
 * - Only agents in an alive status (`starting`, `active`, `idle`) can be
 *   `stale` or `healthy`; for any other status both flags are `false`.
 * - An alive agent is `stale` when its heartbeat age exceeds `thresholdMs`,
 *   and `healthy` otherwise.
 * - If `lastHeartbeat` cannot be parsed as a date, the age is reported as
 *   `Infinity` and an alive agent is considered `stale`. Without this guard
 *   the age would be `NaN`, every comparison against the threshold would be
 *   `false`, and an agent with a corrupt heartbeat would be reported healthy.
 *
 * @param agent - Agent row to evaluate; `id`, `status` and `lastHeartbeat` are read
 * @param thresholdMs - Staleness threshold in milliseconds
 * @returns The health report, evaluated against the current clock
 */
function buildHealthStatus(agent: AgentInstanceRow, thresholdMs: number): AgentHealthStatus {
  const status = agent.status as AgentInstanceStatus;
  const alive = ALIVE_STATUSES.includes(status);

  const lastHeartbeatMs = new Date(agent.lastHeartbeat).getTime();
  const unreadable = Number.isNaN(lastHeartbeatMs);
  // Treat an unparseable timestamp as infinitely old instead of leaking NaN.
  const heartbeatAgeMs = unreadable ? Number.POSITIVE_INFINITY : Date.now() - lastHeartbeatMs;

  const stale = alive && (unreadable || heartbeatAgeMs > thresholdMs);
  const healthy = alive && !stale;

  return {
    agentId: agent.id,
    status,
    lastHeartbeat: agent.lastHeartbeat,
    heartbeatAgeMs,
    healthy,
    stale,
    thresholdMs,
  };
}
@@ -11,6 +11,17 @@
11
11
  * @module agents
12
12
  */
13
13
 
14
+ // Load-balancing registry: task-count capacity, specializations, performance recording
15
+ export {
16
+ type AgentCapacity,
17
+ type AgentPerformanceMetrics,
18
+ getAgentCapacity,
19
+ getAgentSpecializations,
20
+ getAgentsByCapacity,
21
+ MAX_TASKS_PER_AGENT,
22
+ recordAgentPerformance,
23
+ updateAgentSpecializations,
24
+ } from './agent-registry.js';
14
25
  // Schema & types
15
26
  export {
16
27
  AGENT_INSTANCE_STATUSES,
@@ -34,10 +45,35 @@ export {
34
45
  isOverloaded,
35
46
  updateCapacity,
36
47
  } from './capacity.js';
48
+ // Execution learning, failure pattern tracking, and self-healing
49
+ export {
50
+ type AgentExecutionEvent,
51
+ type AgentExecutionOutcome,
52
+ type AgentPerformanceSummary,
53
+ getAgentPerformanceHistory,
54
+ getSelfHealingSuggestions,
55
+ type HealingSuggestion,
56
+ processAgentLifecycleEvent,
57
+ recordAgentExecution,
58
+ recordFailurePattern,
59
+ storeHealingStrategy,
60
+ } from './execution-learning.js';
61
+ // Health monitoring (T039)
62
+ export {
63
+ type AgentHealthStatus,
64
+ checkAgentHealth,
65
+ detectCrashedAgents,
66
+ detectStaleAgents,
67
+ HEARTBEAT_INTERVAL_MS,
68
+ recordHeartbeat,
69
+ STALE_THRESHOLD_MS,
70
+ } from './health-monitor.js';
37
71
  // Registry (CRUD, heartbeat, health, errors)
72
+ // Note: registry.checkAgentHealth (thresholdMs, cwd) -> AgentInstanceRow[] is exported
73
+ // as findStaleAgentRows to avoid conflict with health-monitor.checkAgentHealth (T039).
38
74
  export {
39
75
  type AgentHealthReport,
40
- checkAgentHealth,
76
+ checkAgentHealth as findStaleAgentRows,
41
77
  classifyError,
42
78
  deregisterAgent,
43
79
  generateAgentId,
@@ -47,6 +47,17 @@ export const DEFAULT_RETRY_POLICY: Readonly<RetryPolicy> = Object.freeze({
47
47
 
48
48
  /**
49
49
  * Create a retry policy by merging overrides with the default policy.
50
+ *
51
+ * @remarks
52
+ * Unspecified fields fall back to {@link DEFAULT_RETRY_POLICY}.
53
+ *
54
+ * @param overrides - Partial policy to merge with defaults
55
+ * @returns A complete RetryPolicy
56
+ *
57
+ * @example
58
+ * ```ts
59
+ * const policy = createRetryPolicy({ maxRetries: 5 });
60
+ * ```
50
61
  */
51
62
  export function createRetryPolicy(overrides?: Partial<RetryPolicy>): RetryPolicy {
52
63
  return { ...DEFAULT_RETRY_POLICY, ...overrides };
@@ -55,8 +66,19 @@ export function createRetryPolicy(overrides?: Partial<RetryPolicy>): RetryPolicy
55
66
  /**
56
67
  * Calculate the delay for a given retry attempt using exponential backoff.
57
68
  *
58
- * Formula: min(baseDelay * multiplier^attempt, maxDelay) + jitter
69
+ * @remarks
70
+ * Formula: `min(baseDelay * multiplier^attempt, maxDelay) + jitter`.
59
71
  * Jitter adds 0-25% randomness to prevent thundering herd.
72
+ *
73
+ * @param attempt - Zero-based attempt index
74
+ * @param policy - Retry policy with delay configuration
75
+ * @returns Delay in milliseconds before the next attempt
76
+ *
77
+ * @example
78
+ * ```ts
79
+ * const delay = calculateDelay(1, createRetryPolicy());
80
+ * // => ~2000ms (with jitter)
81
+ * ```
60
82
  */
61
83
  export function calculateDelay(attempt: number, policy: RetryPolicy): number {
62
84
  const exponentialDelay = policy.baseDelayMs * policy.backoffMultiplier ** attempt;
@@ -73,6 +95,20 @@ export function calculateDelay(attempt: number, policy: RetryPolicy): number {
73
95
  /**
74
96
  * Determine whether an error should be retried based on its classification
75
97
  * and the retry policy.
98
+ *
99
+ * @remarks
100
+ * Permanent errors are never retried. Retriable errors are always retried
101
+ * (within attempt limits). Unknown errors defer to `policy.retryOnUnknown`.
102
+ *
103
+ * @param error - The caught error to classify
104
+ * @param attempt - Current attempt number (0-based)
105
+ * @param policy - Retry policy with limits and classification rules
106
+ * @returns True if the error should be retried
107
+ *
108
+ * @example
109
+ * ```ts
110
+ * if (shouldRetry(err, attempt, policy)) { /* retry *\/ }
111
+ * ```
76
112
  */
77
113
  export function shouldRetry(error: unknown, attempt: number, policy: RetryPolicy): boolean {
78
114
  if (attempt >= policy.maxRetries) return false;
@@ -102,13 +138,20 @@ export interface RetryResult<T> {
102
138
  /**
103
139
  * Wrap an async function with retry logic using configurable exponential backoff.
104
140
  *
105
- * The function will be retried according to the policy when retriable errors
106
- * occur. Permanent errors cause immediate failure. Unknown errors respect
107
- * the `retryOnUnknown` policy setting.
141
+ * @remarks
142
+ * Agent-specific variant that integrates with error classification from the
143
+ * agent registry. For a dependency-free generic retry, use `lib/retry.ts`.
108
144
  *
145
+ * @typeParam T - The resolved type of the async function
109
146
  * @param fn - The async function to execute with retries
110
147
  * @param policy - Retry policy (uses DEFAULT_RETRY_POLICY if not provided)
111
148
  * @returns The result of the operation with retry metadata
149
+ *
150
+ * @example
151
+ * ```ts
152
+ * const result = await withRetry(() => fetchAgentTask(agentId));
153
+ * if (!result.success) console.error(result.error);
154
+ * ```
112
155
  */
113
156
  export async function withRetry<T>(
114
157
  fn: () => Promise<T>,
@@ -176,9 +219,19 @@ export interface AgentRecoveryResult {
176
219
  * classified as 'permanent' are abandoned. Agents with retriable errors
177
220
  * are reset to 'starting' for the orchestration layer to re-assign.
178
221
  *
222
+ * @remarks
223
+ * Two-phase process: first detects stale agents via heartbeat threshold,
224
+ * then evaluates each crashed agent's error history for recoverability.
225
+ *
179
226
  * @param thresholdMs - Heartbeat threshold for crash detection (default: 30000)
180
227
  * @param cwd - Working directory
181
228
  * @returns Recovery results for each crashed agent
229
+ *
230
+ * @example
231
+ * ```ts
232
+ * const results = await recoverCrashedAgents(60_000);
233
+ * results.filter(r => r.recovered).forEach(r => console.log(r.agentId));
234
+ * ```
182
235
  */
183
236
  export async function recoverCrashedAgents(
184
237
  thresholdMs: number = 30_000,