@stoneforge/smithy 1.1.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. package/README.md +109 -18
  2. package/dist/cli/commands/agent.js +10 -10
  3. package/dist/cli/commands/agent.js.map +1 -1
  4. package/dist/cli/commands/pool.d.ts.map +1 -1
  5. package/dist/cli/commands/pool.js +33 -16
  6. package/dist/cli/commands/pool.js.map +1 -1
  7. package/dist/cli/commands/serve.d.ts +4 -16
  8. package/dist/cli/commands/serve.d.ts.map +1 -1
  9. package/dist/cli/commands/serve.js +0 -1
  10. package/dist/cli/commands/serve.js.map +1 -1
  11. package/dist/index.d.ts +2 -0
  12. package/dist/index.d.ts.map +1 -1
  13. package/dist/index.js +2 -0
  14. package/dist/index.js.map +1 -1
  15. package/dist/prompts/director.md +2 -2
  16. package/dist/prompts/index.d.ts.map +1 -1
  17. package/dist/prompts/index.js +6 -3
  18. package/dist/prompts/index.js.map +1 -1
  19. package/dist/prompts/persistent-worker.md +1 -1
  20. package/dist/prompts/steward-base.md +4 -4
  21. package/dist/prompts/steward-recovery.md +113 -0
  22. package/dist/prompts/worker.md +12 -1
  23. package/dist/providers/claude/headless.d.ts +2 -0
  24. package/dist/providers/claude/headless.d.ts.map +1 -1
  25. package/dist/providers/claude/headless.js +8 -0
  26. package/dist/providers/claude/headless.js.map +1 -1
  27. package/dist/providers/claude/index.js +1 -1
  28. package/dist/providers/claude/index.js.map +1 -1
  29. package/dist/runtime/session-manager.d.ts +22 -2
  30. package/dist/runtime/session-manager.d.ts.map +1 -1
  31. package/dist/runtime/session-manager.js +74 -16
  32. package/dist/runtime/session-manager.js.map +1 -1
  33. package/dist/runtime/spawner.d.ts.map +1 -1
  34. package/dist/runtime/spawner.js +10 -0
  35. package/dist/runtime/spawner.js.map +1 -1
  36. package/dist/server/config.d.ts.map +1 -1
  37. package/dist/server/config.js +3 -1
  38. package/dist/server/config.js.map +1 -1
  39. package/dist/server/daemon-state.d.ts.map +1 -1
  40. package/dist/server/daemon-state.js +5 -3
  41. package/dist/server/daemon-state.js.map +1 -1
  42. package/dist/server/events-websocket.d.ts.map +1 -1
  43. package/dist/server/events-websocket.js +7 -5
  44. package/dist/server/events-websocket.js.map +1 -1
  45. package/dist/server/formatters.d.ts +16 -2
  46. package/dist/server/formatters.d.ts.map +1 -1
  47. package/dist/server/formatters.js +23 -2
  48. package/dist/server/formatters.js.map +1 -1
  49. package/dist/server/index.d.ts.map +1 -1
  50. package/dist/server/index.js +16 -12
  51. package/dist/server/index.js.map +1 -1
  52. package/dist/server/lsp-websocket.d.ts.map +1 -1
  53. package/dist/server/lsp-websocket.js +10 -8
  54. package/dist/server/lsp-websocket.js.map +1 -1
  55. package/dist/server/routes/agents.d.ts.map +1 -1
  56. package/dist/server/routes/agents.js +81 -15
  57. package/dist/server/routes/agents.js.map +1 -1
  58. package/dist/server/routes/daemon.d.ts.map +1 -1
  59. package/dist/server/routes/daemon.js +6 -4
  60. package/dist/server/routes/daemon.js.map +1 -1
  61. package/dist/server/routes/events.d.ts.map +1 -1
  62. package/dist/server/routes/events.js +4 -2
  63. package/dist/server/routes/events.js.map +1 -1
  64. package/dist/server/routes/extensions.d.ts.map +1 -1
  65. package/dist/server/routes/extensions.js +9 -7
  66. package/dist/server/routes/extensions.js.map +1 -1
  67. package/dist/server/routes/index.d.ts +1 -0
  68. package/dist/server/routes/index.d.ts.map +1 -1
  69. package/dist/server/routes/index.js +1 -0
  70. package/dist/server/routes/index.js.map +1 -1
  71. package/dist/server/routes/plugins.d.ts.map +1 -1
  72. package/dist/server/routes/plugins.js +6 -4
  73. package/dist/server/routes/plugins.js.map +1 -1
  74. package/dist/server/routes/pools.d.ts.map +1 -1
  75. package/dist/server/routes/pools.js +26 -7
  76. package/dist/server/routes/pools.js.map +1 -1
  77. package/dist/server/routes/scheduler.d.ts.map +1 -1
  78. package/dist/server/routes/scheduler.js +9 -7
  79. package/dist/server/routes/scheduler.js.map +1 -1
  80. package/dist/server/routes/sessions.d.ts.map +1 -1
  81. package/dist/server/routes/sessions.js +17 -15
  82. package/dist/server/routes/sessions.js.map +1 -1
  83. package/dist/server/routes/settings.d.ts +10 -0
  84. package/dist/server/routes/settings.d.ts.map +1 -0
  85. package/dist/server/routes/settings.js +65 -0
  86. package/dist/server/routes/settings.js.map +1 -0
  87. package/dist/server/routes/tasks.d.ts.map +1 -1
  88. package/dist/server/routes/tasks.js +54 -31
  89. package/dist/server/routes/tasks.js.map +1 -1
  90. package/dist/server/routes/upload.d.ts.map +1 -1
  91. package/dist/server/routes/upload.js +3 -1
  92. package/dist/server/routes/upload.js.map +1 -1
  93. package/dist/server/routes/workflows.d.ts.map +1 -1
  94. package/dist/server/routes/workflows.js +17 -15
  95. package/dist/server/routes/workflows.js.map +1 -1
  96. package/dist/server/routes/workspace-files.d.ts.map +1 -1
  97. package/dist/server/routes/workspace-files.js +11 -9
  98. package/dist/server/routes/workspace-files.js.map +1 -1
  99. package/dist/server/routes/worktrees.d.ts.map +1 -1
  100. package/dist/server/routes/worktrees.js +6 -4
  101. package/dist/server/routes/worktrees.js.map +1 -1
  102. package/dist/server/server.d.ts.map +1 -1
  103. package/dist/server/server.js +10 -8
  104. package/dist/server/server.js.map +1 -1
  105. package/dist/server/services/lsp-manager.d.ts.map +1 -1
  106. package/dist/server/services/lsp-manager.js +15 -13
  107. package/dist/server/services/lsp-manager.js.map +1 -1
  108. package/dist/server/services.d.ts +2 -2
  109. package/dist/server/services.d.ts.map +1 -1
  110. package/dist/server/services.js +41 -13
  111. package/dist/server/services.js.map +1 -1
  112. package/dist/server/static.d.ts.map +1 -1
  113. package/dist/server/static.js +3 -1
  114. package/dist/server/static.js.map +1 -1
  115. package/dist/server/websocket.d.ts.map +1 -1
  116. package/dist/server/websocket.js +6 -4
  117. package/dist/server/websocket.js.map +1 -1
  118. package/dist/services/agent-pool-service.d.ts.map +1 -1
  119. package/dist/services/agent-pool-service.js +7 -8
  120. package/dist/services/agent-pool-service.js.map +1 -1
  121. package/dist/services/agent-registry.d.ts +8 -1
  122. package/dist/services/agent-registry.d.ts.map +1 -1
  123. package/dist/services/agent-registry.js +27 -4
  124. package/dist/services/agent-registry.js.map +1 -1
  125. package/dist/services/dispatch-daemon.d.ts +64 -2
  126. package/dist/services/dispatch-daemon.d.ts.map +1 -1
  127. package/dist/services/dispatch-daemon.js +387 -59
  128. package/dist/services/dispatch-daemon.js.map +1 -1
  129. package/dist/services/index.d.ts +1 -2
  130. package/dist/services/index.d.ts.map +1 -1
  131. package/dist/services/index.js +6 -11
  132. package/dist/services/index.js.map +1 -1
  133. package/dist/services/merge-steward-service.d.ts.map +1 -1
  134. package/dist/services/merge-steward-service.js +6 -4
  135. package/dist/services/merge-steward-service.js.map +1 -1
  136. package/dist/services/settings-service.d.ts +56 -0
  137. package/dist/services/settings-service.d.ts.map +1 -0
  138. package/dist/services/settings-service.js +92 -0
  139. package/dist/services/settings-service.js.map +1 -0
  140. package/dist/services/steward-scheduler.d.ts +37 -5
  141. package/dist/services/steward-scheduler.d.ts.map +1 -1
  142. package/dist/services/steward-scheduler.js +224 -41
  143. package/dist/services/steward-scheduler.js.map +1 -1
  144. package/dist/services/task-assignment-service.d.ts.map +1 -1
  145. package/dist/services/task-assignment-service.js +3 -0
  146. package/dist/services/task-assignment-service.js.map +1 -1
  147. package/dist/testing/test-context.d.ts +1 -1
  148. package/dist/testing/test-context.d.ts.map +1 -1
  149. package/dist/types/agent-pool.d.ts +4 -0
  150. package/dist/types/agent-pool.d.ts.map +1 -1
  151. package/dist/types/agent-pool.js +8 -1
  152. package/dist/types/agent-pool.js.map +1 -1
  153. package/dist/types/agent.d.ts +37 -7
  154. package/dist/types/agent.d.ts.map +1 -1
  155. package/dist/types/agent.js +2 -2
  156. package/dist/types/agent.js.map +1 -1
  157. package/dist/types/role-definition.d.ts +1 -1
  158. package/dist/types/role-definition.js +1 -1
  159. package/dist/types/role-definition.js.map +1 -1
  160. package/dist/types/task-meta.d.ts +6 -0
  161. package/dist/types/task-meta.d.ts.map +1 -1
  162. package/dist/types/task-meta.js.map +1 -1
  163. package/dist/utils/logger.d.ts +66 -0
  164. package/dist/utils/logger.d.ts.map +1 -0
  165. package/dist/utils/logger.js +133 -0
  166. package/dist/utils/logger.js.map +1 -0
  167. package/package.json +7 -7
  168. package/web/assets/{index-R1cylSgw.js → index-8dBly5AJ.js} +697 -437
  169. package/web/assets/index-CNcjZKzg.css +32 -0
  170. package/web/assets/{utils-vendor-DaJ2Dubl.js → utils-vendor-B7jOGaxP.js} +1 -1
  171. package/web/index.html +3 -3
  172. package/dist/prompts/steward-health.md +0 -39
  173. package/dist/prompts/steward-ops.md +0 -28
  174. package/dist/prompts/steward-reminder.md +0 -26
  175. package/dist/services/health-steward-service.d.ts +0 -446
  176. package/dist/services/health-steward-service.d.ts.map +0 -1
  177. package/dist/services/health-steward-service.js +0 -866
  178. package/dist/services/health-steward-service.js.map +0 -1
  179. package/web/assets/index-DqP-_E4F.css +0 -32
@@ -1,866 +0,0 @@
1
- /**
2
- * Health Steward Service
3
- *
4
- * This service monitors agent health and detects stuck agents.
5
- * The Health Steward detects problematic agents and takes corrective action.
6
- *
7
- * Key features:
8
- * - Detect agents with no output for configurable duration
9
- * - Detect agents with repeated errors
10
- * - Detect crashed agent processes
11
- * - Attempt to unstick agents
12
- * - Notify Director of issues
13
- * - Stop and reassign tasks from stuck agents
14
- *
15
- * TB-O24: Health Steward Implementation
16
- *
17
- * @module
18
- */
19
- import { EventEmitter } from 'node:events';
20
- import { createTimestamp, TaskStatus } from '@stoneforge/core';
21
- import { getAgentMetadata } from './agent-registry.js';
22
/**
 * Every issue type string recognised by the health steward.
 * isHealthIssueType() validates values against this list.
 */
export const HealthIssueTypes = [
    'no_output', 'repeated_errors', 'process_crashed',
    'high_error_rate', 'session_stale', 'unresponsive',
];
33
/**
 * Runtime guard: is the value one of the strings listed in HealthIssueTypes?
 *
 * @param {unknown} value - Candidate value of any type.
 * @returns {boolean} true only when value is a string contained in HealthIssueTypes.
 */
export function isHealthIssueType(value) {
    if (typeof value !== 'string') {
        return false;
    }
    return HealthIssueTypes.includes(value);
}
39
/**
 * Every severity level an issue can carry.
 * isHealthIssueSeverity() validates values against this list.
 */
export const HealthIssueSeverities = [
    'warning',
    'error',
    'critical',
];
43
/**
 * Runtime guard: is the value one of the strings listed in HealthIssueSeverities?
 *
 * @param {unknown} value - Candidate value of any type.
 * @returns {boolean} true only when value is a string contained in HealthIssueSeverities.
 */
export function isHealthIssueSeverity(value) {
    if (typeof value !== 'string') {
        return false;
    }
    return HealthIssueSeverities.includes(value);
}
49
/**
 * Every corrective action name the health steward understands.
 * isHealthAction() validates values against this list.
 */
export const HealthActions = [
    'monitor', 'send_ping', 'restart',
    'notify_director', 'reassign_task', 'escalate',
];
60
/**
 * Runtime guard: is the value one of the strings listed in HealthActions?
 *
 * @param {unknown} value - Candidate value of any type.
 * @returns {boolean} true only when value is a string contained in HealthActions.
 */
export function isHealthAction(value) {
    if (typeof value !== 'string') {
        return false;
    }
    return HealthActions.includes(value);
}
66
- // ============================================================================
67
- // Health Steward Service Implementation
68
- // ============================================================================
69
/**
 * Built-in configuration. The service constructor spreads caller-supplied
 * options over these values, so any key given by the caller wins.
 *
 * All *Ms values are durations in milliseconds.
 */
const DEFAULT_CONFIG = (() => {
    const MS_PER_MINUTE = 60 * 1000;
    return {
        // A running agent silent for longer than this is reported as 'no_output'.
        noOutputThresholdMs: 5 * MS_PER_MINUTE,
        // This many errors inside errorWindowMs is reported as 'repeated_errors'.
        errorCountThreshold: 5,
        // Sliding window used when counting recent errors and outputs.
        errorWindowMs: 10 * MS_PER_MINUTE,
        // A session inactive for longer than this is reported as 'session_stale'.
        staleSessionThresholdMs: 15 * MS_PER_MINUTE,
        // Period of the timer installed by start().
        healthCheckIntervalMs: 1 * MS_PER_MINUTE,
        // pingAgent() refuses to ping once this many attempts were made.
        maxPingAttempts: 3,
        // NOTE(review): the four flags/limits below are not read in the visible
        // part of this file; presumably consumed when choosing actions — confirm.
        autoRestart: true,
        autoReassign: true,
        notifyDirector: true,
        maxIssuesPerAgent: 10,
    };
})();
84
- /**
85
- * Implementation of the Health Steward Service.
86
- */
87
- export class HealthStewardServiceImpl {
88
- // Reserved for future persistence of health data
89
- _api;
90
- config;
91
- agentRegistry;
92
- sessionManager;
93
- taskAssignment;
94
- dispatchService;
95
- emitter;
96
- // State
97
- running = false;
98
- checkInterval;
99
- issueCounter = 0;
100
- // Tracking
101
- activityTrackers = new Map();
102
- activeIssues = new Map();
103
- resolvedIssueIds = new Set();
104
- // Statistics
105
- totalChecks = 0;
106
- totalIssuesDetected = 0;
107
- totalIssuesResolved = 0;
108
- totalActionsTaken = 0;
109
- successfulActions = 0;
110
- failedActions = 0;
111
- constructor(api, agentRegistry, sessionManager, taskAssignment, dispatchService, config = {}) {
112
- this._api = api;
113
- this.agentRegistry = agentRegistry;
114
- this.sessionManager = sessionManager;
115
- this.taskAssignment = taskAssignment;
116
- this.dispatchService = dispatchService;
117
- this.config = { ...DEFAULT_CONFIG, ...config };
118
- this.emitter = new EventEmitter();
119
- }
120
- // ----------------------------------------
121
- // Health Checks
122
- // ----------------------------------------
123
- async runHealthCheck() {
124
- const startTime = Date.now();
125
- const timestamp = createTimestamp();
126
- const newIssues = [];
127
- const resolvedIssues = [];
128
- const actionsTaken = [];
129
- // Get all agents with running sessions
130
- const agents = await this.getRunningAgents();
131
- let agentsWithIssues = 0;
132
- for (const agent of agents) {
133
- const agentId = agent.id;
134
- const status = await this.checkAgent(agentId);
135
- if (!status.isHealthy) {
136
- agentsWithIssues++;
137
- // Record new issues
138
- for (const issue of status.issues) {
139
- if (!this.activeIssues.has(issue.id)) {
140
- newIssues.push(issue);
141
- this.emitter.emit('issue:detected', issue);
142
- }
143
- }
144
- // Take automatic actions if configured
145
- for (const issue of status.issues) {
146
- const action = await this.determineAction(issue);
147
- if (action !== 'monitor') {
148
- const result = await this.takeAction(issue.id, action);
149
- actionsTaken.push(result);
150
- }
151
- }
152
- }
153
- }
154
- // Check for resolved issues
155
- for (const [issueId, issue] of this.activeIssues) {
156
- const agent = await this.agentRegistry.getAgent(issue.agentId);
157
- if (!agent) {
158
- // Agent no longer exists - resolve issue
159
- this.resolveIssue(issueId);
160
- resolvedIssues.push(issueId);
161
- continue;
162
- }
163
- // Check if the issue condition still applies
164
- const stillApplies = await this.issueStillApplies(issue);
165
- if (!stillApplies) {
166
- this.resolveIssue(issueId);
167
- resolvedIssues.push(issueId);
168
- }
169
- }
170
- this.totalChecks++;
171
- const result = {
172
- timestamp,
173
- agentsChecked: agents.length,
174
- agentsWithIssues,
175
- newIssues,
176
- resolvedIssues,
177
- actionsTaken,
178
- durationMs: Date.now() - startTime,
179
- };
180
- this.emitter.emit('check:completed', result);
181
- return result;
182
- }
183
- async checkAgent(agentId) {
184
- const agent = await this.agentRegistry.getAgent(agentId);
185
- if (!agent) {
186
- return this.createUnhealthyStatus(agentId, 'unknown', 'worker', []);
187
- }
188
- const meta = getAgentMetadata(agent);
189
- if (!meta) {
190
- return this.createUnhealthyStatus(agentId, agent.name, 'worker', []);
191
- }
192
- const tracker = this.getOrCreateTracker(agentId);
193
- tracker.lastHealthCheckAt = createTimestamp();
194
- const issues = [];
195
- const now = Date.now();
196
- // Get session info
197
- const session = this.sessionManager.getActiveSession(agentId);
198
- const sessionStatus = session?.status ?? meta.sessionStatus ?? 'idle';
199
- // Only check running sessions
200
- if (sessionStatus === 'running') {
201
- // Check for no output
202
- if (tracker.lastOutputAt) {
203
- const lastOutputTime = this.getTimestampMs(tracker.lastOutputAt);
204
- const timeSinceOutput = now - lastOutputTime;
205
- if (timeSinceOutput > this.config.noOutputThresholdMs) {
206
- issues.push(this.createIssue(agentId, agent.name, meta.agentRole, 'no_output', {
207
- timeSinceOutputMs: timeSinceOutput,
208
- thresholdMs: this.config.noOutputThresholdMs,
209
- sessionId: session?.id,
210
- }));
211
- }
212
- }
213
- // Check for repeated errors
214
- const recentErrors = this.countRecentEvents(tracker.errorTimestamps, this.config.errorWindowMs);
215
- if (recentErrors >= this.config.errorCountThreshold) {
216
- issues.push(this.createIssue(agentId, agent.name, meta.agentRole, 'repeated_errors', {
217
- errorCount: recentErrors,
218
- thresholdCount: this.config.errorCountThreshold,
219
- windowMs: this.config.errorWindowMs,
220
- sessionId: session?.id,
221
- }));
222
- }
223
- // Check for high error rate
224
- const recentOutputs = this.countRecentEvents(tracker.outputTimestamps, this.config.errorWindowMs);
225
- if (recentOutputs > 0 && recentErrors > 0) {
226
- const errorRate = recentErrors / (recentErrors + recentOutputs);
227
- if (errorRate > 0.5) { // More than 50% errors
228
- issues.push(this.createIssue(agentId, agent.name, meta.agentRole, 'high_error_rate', {
229
- errorRate,
230
- errorCount: recentErrors,
231
- outputCount: recentOutputs,
232
- sessionId: session?.id,
233
- }));
234
- }
235
- }
236
- // Check for stale session
237
- if (session?.lastActivityAt) {
238
- const lastActivityTime = this.getTimestampMs(session.lastActivityAt);
239
- const timeSinceActivity = now - lastActivityTime;
240
- if (timeSinceActivity > this.config.staleSessionThresholdMs) {
241
- issues.push(this.createIssue(agentId, agent.name, meta.agentRole, 'session_stale', {
242
- timeSinceActivityMs: timeSinceActivity,
243
- thresholdMs: this.config.staleSessionThresholdMs,
244
- sessionId: session.id,
245
- }));
246
- }
247
- }
248
- }
249
- // Get current task
250
- const agentTasks = await this.taskAssignment.getAgentTasks(agentId, {
251
- taskStatus: TaskStatus.IN_PROGRESS,
252
- });
253
- const currentTaskId = agentTasks[0]?.taskId;
254
- // Add task context to issues
255
- for (const issue of issues) {
256
- if (currentTaskId) {
257
- issue.taskId = currentTaskId;
258
- }
259
- }
260
- // Update active issues
261
- for (const issue of issues) {
262
- const existingIssue = this.findExistingIssue(agentId, issue.issueType);
263
- if (existingIssue) {
264
- // Update existing issue
265
- this.activeIssues.set(existingIssue.id, {
266
- ...existingIssue,
267
- lastSeenAt: createTimestamp(),
268
- occurrenceCount: existingIssue.occurrenceCount + 1,
269
- context: issue.context,
270
- });
271
- }
272
- else {
273
- // New issue
274
- this.activeIssues.set(issue.id, issue);
275
- this.totalIssuesDetected++;
276
- }
277
- }
278
- const recentErrors = this.countRecentEvents(tracker.errorTimestamps, this.config.errorWindowMs);
279
- const recentOutputs = this.countRecentEvents(tracker.outputTimestamps, this.config.errorWindowMs);
280
- return {
281
- agentId,
282
- agentName: agent.name,
283
- agentRole: meta.agentRole,
284
- isHealthy: issues.length === 0,
285
- issues,
286
- lastActivityAt: tracker.lastOutputAt ?? meta.lastActivityAt,
287
- lastHealthCheckAt: tracker.lastHealthCheckAt,
288
- sessionStatus,
289
- currentTaskId,
290
- recentErrorCount: recentErrors,
291
- recentOutputCount: recentOutputs,
292
- };
293
- }
294
- async getAllAgentHealth() {
295
- const agents = await this.getRunningAgents();
296
- const statuses = [];
297
- for (const agent of agents) {
298
- const agentId = agent.id;
299
- const status = await this.checkAgent(agentId);
300
- statuses.push(status);
301
- }
302
- return statuses;
303
- }
304
- // ----------------------------------------
305
- // Issue Management
306
- // ----------------------------------------
307
- getActiveIssues() {
308
- return Array.from(this.activeIssues.values());
309
- }
310
- getIssuesForAgent(agentId) {
311
- return Array.from(this.activeIssues.values()).filter((issue) => issue.agentId === agentId);
312
- }
313
- resolveIssue(issueId) {
314
- const issue = this.activeIssues.get(issueId);
315
- if (!issue) {
316
- return false;
317
- }
318
- this.activeIssues.delete(issueId);
319
- this.resolvedIssueIds.add(issueId);
320
- this.totalIssuesResolved++;
321
- this.emitter.emit('issue:resolved', issueId);
322
- return true;
323
- }
324
- clearResolvedIssues() {
325
- this.resolvedIssueIds.clear();
326
- }
327
- // ----------------------------------------
328
- // Actions
329
- // ----------------------------------------
330
- async takeAction(issueId, action) {
331
- const issue = this.activeIssues.get(issueId);
332
- if (!issue) {
333
- return {
334
- success: false,
335
- action: action ?? 'monitor',
336
- issueId,
337
- message: `Issue not found: ${issueId}`,
338
- actionTakenAt: createTimestamp(),
339
- error: 'Issue not found',
340
- };
341
- }
342
- const effectiveAction = action ?? await this.determineAction(issue);
343
- this.totalActionsTaken++;
344
- let result;
345
- try {
346
- switch (effectiveAction) {
347
- case 'monitor':
348
- result = {
349
- success: true,
350
- action: effectiveAction,
351
- issueId,
352
- message: `Continuing to monitor agent ${issue.agentName}`,
353
- actionTakenAt: createTimestamp(),
354
- };
355
- break;
356
- case 'send_ping':
357
- const pingSuccess = await this.pingAgent(issue.agentId);
358
- result = {
359
- success: pingSuccess,
360
- action: effectiveAction,
361
- issueId,
362
- message: pingSuccess
363
- ? `Agent ${issue.agentName} responded to ping`
364
- : `Agent ${issue.agentName} did not respond to ping`,
365
- actionTakenAt: createTimestamp(),
366
- };
367
- break;
368
- case 'restart':
369
- const restartSuccess = await this.restartAgent(issue.agentId);
370
- result = {
371
- success: restartSuccess,
372
- action: effectiveAction,
373
- issueId,
374
- message: restartSuccess
375
- ? `Successfully restarted agent ${issue.agentName}`
376
- : `Failed to restart agent ${issue.agentName}`,
377
- actionTakenAt: createTimestamp(),
378
- };
379
- if (restartSuccess) {
380
- this.resolveIssue(issueId);
381
- }
382
- break;
383
- case 'notify_director':
384
- const notifySuccess = await this.notifyDirector(issue);
385
- result = {
386
- success: notifySuccess,
387
- action: effectiveAction,
388
- issueId,
389
- message: notifySuccess
390
- ? `Director notified about ${issue.agentName}`
391
- : `Failed to notify Director about ${issue.agentName}`,
392
- actionTakenAt: createTimestamp(),
393
- };
394
- break;
395
- case 'reassign_task':
396
- if (!issue.taskId) {
397
- result = {
398
- success: false,
399
- action: effectiveAction,
400
- issueId,
401
- message: `No task to reassign for agent ${issue.agentName}`,
402
- actionTakenAt: createTimestamp(),
403
- error: 'No task associated with issue',
404
- };
405
- }
406
- else {
407
- const reassignResult = await this.reassignTask(issue.agentId, issue.taskId);
408
- result = {
409
- success: reassignResult.success,
410
- action: effectiveAction,
411
- issueId,
412
- message: reassignResult.success
413
- ? `Task reassigned from ${issue.agentName} to new agent`
414
- : `Failed to reassign task from ${issue.agentName}`,
415
- actionTakenAt: createTimestamp(),
416
- newAgentId: reassignResult.newAgentId,
417
- error: reassignResult.error,
418
- };
419
- if (reassignResult.success) {
420
- this.resolveIssue(issueId);
421
- }
422
- }
423
- break;
424
- case 'escalate':
425
- // For escalate, we notify both Director and mark for human review
426
- await this.notifyDirector(issue);
427
- result = {
428
- success: true,
429
- action: effectiveAction,
430
- issueId,
431
- message: `Issue escalated for agent ${issue.agentName} - requires human intervention`,
432
- actionTakenAt: createTimestamp(),
433
- };
434
- break;
435
- default:
436
- result = {
437
- success: false,
438
- action: effectiveAction,
439
- issueId,
440
- message: `Unknown action: ${effectiveAction}`,
441
- actionTakenAt: createTimestamp(),
442
- error: `Unknown action: ${effectiveAction}`,
443
- };
444
- }
445
- }
446
- catch (error) {
447
- const errorMessage = error instanceof Error ? error.message : String(error);
448
- result = {
449
- success: false,
450
- action: effectiveAction,
451
- issueId,
452
- message: `Action failed: ${errorMessage}`,
453
- actionTakenAt: createTimestamp(),
454
- error: errorMessage,
455
- };
456
- }
457
- if (result.success) {
458
- this.successfulActions++;
459
- }
460
- else {
461
- this.failedActions++;
462
- }
463
- this.emitter.emit('action:taken', result);
464
- return result;
465
- }
466
- async pingAgent(agentId) {
467
- const tracker = this.getOrCreateTracker(agentId);
468
- // Check if we've exceeded ping attempts
469
- if (tracker.pingAttempts >= this.config.maxPingAttempts) {
470
- return false;
471
- }
472
- const session = this.sessionManager.getActiveSession(agentId);
473
- if (!session || session.status !== 'running') {
474
- return false;
475
- }
476
- tracker.lastPingAt = createTimestamp();
477
- tracker.pingAttempts++;
478
- // Send a health check message via the session
479
- const result = await this.sessionManager.messageSession(session.id, {
480
- content: 'Health check ping - please respond with any message to confirm you are active.',
481
- });
482
- return result.success;
483
- }
484
- async restartAgent(agentId) {
485
- const session = this.sessionManager.getActiveSession(agentId);
486
- if (session) {
487
- try {
488
- // Stop the current session
489
- await this.sessionManager.stopSession(session.id, {
490
- graceful: true,
491
- reason: 'Health Steward restart due to detected issues',
492
- });
493
- }
494
- catch {
495
- // Continue even if stop fails
496
- }
497
- }
498
- // Reset tracker
499
- const tracker = this.getOrCreateTracker(agentId);
500
- tracker.pingAttempts = 0;
501
- tracker.errorTimestamps = [];
502
- // Note: We don't auto-start a new session here - that's the responsibility
503
- // of the agent or Director. We've cleared the issues by stopping.
504
- return true;
505
- }
506
- async notifyDirector(issue) {
507
- // Find the Director agent
508
- const directors = await this.agentRegistry.getAgentsByRole('director');
509
- if (directors.length === 0) {
510
- return false;
511
- }
512
- const director = directors[0];
513
- const directorId = director.id;
514
- // Build notification content
515
- const severity = issue.severity.toUpperCase();
516
- const content = [
517
- `# Health Alert: ${severity}`,
518
- '',
519
- `**Agent:** ${issue.agentName} (${issue.agentRole})`,
520
- `**Issue:** ${issue.issueType}`,
521
- `**Description:** ${issue.description}`,
522
- '',
523
- `**Detected:** ${issue.detectedAt}`,
524
- `**Occurrences:** ${issue.occurrenceCount}`,
525
- '',
526
- issue.taskId ? `**Current Task:** ${issue.taskId}` : '',
527
- '',
528
- '**Recommended Action:**',
529
- this.getRecommendedActionDescription(issue),
530
- ].filter(Boolean).join('\n');
531
- try {
532
- // Use task-assignment type as a general notification mechanism
533
- await this.dispatchService.notifyAgent(directorId, 'task-assignment', content, {
534
- healthAlert: true,
535
- issueId: issue.id,
536
- issueType: issue.issueType,
537
- severity: issue.severity,
538
- agentId: issue.agentId,
539
- agentName: issue.agentName,
540
- taskId: issue.taskId,
541
- });
542
- return true;
543
- }
544
- catch {
545
- return false;
546
- }
547
- }
548
- async reassignTask(agentId, taskId) {
549
- try {
550
- // Stop the current agent session
551
- const session = this.sessionManager.getActiveSession(agentId);
552
- if (session) {
553
- await this.sessionManager.stopSession(session.id, {
554
- graceful: false,
555
- reason: 'Task reassignment due to agent health issues',
556
- });
557
- }
558
- // Unassign the task - it will be picked up by the dispatch daemon
559
- await this.taskAssignment.unassignTask(taskId);
560
- // Task reassignment is now handled by the dispatch daemon polling for
561
- // unassigned tasks, rather than immediately dispatching here
562
- return {
563
- success: true,
564
- // Note: newAgentId is undefined since dispatch daemon handles assignment
565
- };
566
- }
567
- catch (error) {
568
- const errorMessage = error instanceof Error ? error.message : String(error);
569
- return {
570
- success: false,
571
- error: errorMessage,
572
- };
573
- }
574
- }
575
- // ----------------------------------------
576
- // Activity Tracking
577
- // ----------------------------------------
578
- recordOutput(agentId) {
579
- const tracker = this.getOrCreateTracker(agentId);
580
- const now = createTimestamp();
581
- tracker.lastOutputAt = now;
582
- tracker.outputTimestamps.push(now);
583
- // Reset ping attempts on successful output
584
- tracker.pingAttempts = 0;
585
- // Prune old timestamps
586
- this.pruneTimestamps(tracker);
587
- }
588
- recordError(agentId, _error) {
589
- const tracker = this.getOrCreateTracker(agentId);
590
- const now = createTimestamp();
591
- tracker.lastErrorAt = now;
592
- tracker.errorTimestamps.push(now);
593
- // Prune old timestamps
594
- this.pruneTimestamps(tracker);
595
- }
596
- recordCrash(agentId, exitCode) {
597
- const agent = this.activityTrackers.get(agentId);
598
- const agentName = agent ? 'Agent' : 'Unknown';
599
- // Create a crash issue immediately
600
- const issue = this.createIssue(agentId, agentName, 'worker', // Default, will be updated
601
- 'process_crashed', { exitCode });
602
- this.activeIssues.set(issue.id, issue);
603
- this.totalIssuesDetected++;
604
- this.emitter.emit('issue:detected', issue);
605
- }
606
- // ----------------------------------------
607
- // Lifecycle
608
- // ----------------------------------------
609
- start() {
610
- if (this.running) {
611
- return;
612
- }
613
- this.running = true;
614
- this.checkInterval = setInterval(() => this.runHealthCheck().catch(() => { }), this.config.healthCheckIntervalMs);
615
- }
616
- stop() {
617
- if (!this.running) {
618
- return;
619
- }
620
- this.running = false;
621
- if (this.checkInterval) {
622
- clearInterval(this.checkInterval);
623
- this.checkInterval = undefined;
624
- }
625
- }
626
- isRunning() {
627
- return this.running;
628
- }
629
- // ----------------------------------------
630
- // Statistics
631
- // ----------------------------------------
632
- getStats() {
633
- return {
634
- totalChecks: this.totalChecks,
635
- totalIssuesDetected: this.totalIssuesDetected,
636
- totalIssuesResolved: this.totalIssuesResolved,
637
- totalActionsTaken: this.totalActionsTaken,
638
- successfulActions: this.successfulActions,
639
- failedActions: this.failedActions,
640
- activeIssues: this.activeIssues.size,
641
- monitoredAgents: this.activityTrackers.size,
642
- };
643
- }
644
- on(event, listener) {
645
- this.emitter.on(event, listener);
646
- }
647
- off(event, listener) {
648
- this.emitter.off(event, listener);
649
- }
650
- // ----------------------------------------
651
- // Private Helpers
652
- // ----------------------------------------
653
- async getRunningAgents() {
654
- const allAgents = await this.agentRegistry.listAgents({
655
- sessionStatus: 'running',
656
- });
657
- return allAgents;
658
- }
659
- getOrCreateTracker(agentId) {
660
- let tracker = this.activityTrackers.get(agentId);
661
- if (!tracker) {
662
- tracker = {
663
- agentId,
664
- errorTimestamps: [],
665
- outputTimestamps: [],
666
- pingAttempts: 0,
667
- };
668
- this.activityTrackers.set(agentId, tracker);
669
- }
670
- return tracker;
671
- }
672
- createIssue(agentId, agentName, agentRole, issueType, context) {
673
- const now = createTimestamp();
674
- const severity = this.determineSeverity(issueType, context);
675
- return {
676
- id: `health-${++this.issueCounter}-${Date.now()}`,
677
- agentId,
678
- agentName,
679
- agentRole,
680
- issueType,
681
- severity,
682
- description: this.getIssueDescription(issueType, context),
683
- detectedAt: now,
684
- lastSeenAt: now,
685
- occurrenceCount: 1,
686
- sessionId: context?.sessionId,
687
- context,
688
- };
689
- }
690
- createUnhealthyStatus(agentId, agentName, agentRole, issues) {
691
- return {
692
- agentId,
693
- agentName,
694
- agentRole,
695
- isHealthy: false,
696
- issues,
697
- recentErrorCount: 0,
698
- recentOutputCount: 0,
699
- };
700
- }
701
- determineSeverity(issueType, context) {
702
- switch (issueType) {
703
- case 'process_crashed':
704
- return 'critical';
705
- case 'repeated_errors':
706
- const errorCount = context?.errorCount ?? 0;
707
- return errorCount > 10 ? 'critical' : 'error';
708
- case 'high_error_rate':
709
- return 'error';
710
- case 'no_output':
711
- const timeSinceOutput = context?.timeSinceOutputMs ?? 0;
712
- return timeSinceOutput > 15 * 60 * 1000 ? 'error' : 'warning';
713
- case 'session_stale':
714
- return 'warning';
715
- case 'unresponsive':
716
- return 'error';
717
- default:
718
- return 'warning';
719
- }
720
- }
721
- getIssueDescription(issueType, context) {
722
- switch (issueType) {
723
- case 'no_output':
724
- const mins = Math.round((context?.timeSinceOutputMs ?? 0) / 60000);
725
- return `Agent has produced no output for ${mins} minutes`;
726
- case 'repeated_errors':
727
- return `Agent has encountered ${context?.errorCount ?? 'multiple'} errors in the last ${Math.round((context?.windowMs ?? 600000) / 60000)} minutes`;
728
- case 'process_crashed':
729
- return `Agent process crashed with exit code ${context?.exitCode ?? 'unknown'}`;
730
- case 'high_error_rate':
731
- const rate = (context?.errorRate ?? 0) * 100;
732
- return `Agent has a ${rate.toFixed(1)}% error rate`;
733
- case 'session_stale':
734
- const staleMins = Math.round((context?.timeSinceActivityMs ?? 0) / 60000);
735
- return `Agent session has been inactive for ${staleMins} minutes`;
736
- case 'unresponsive':
737
- return 'Agent is not responding to health check pings';
738
- default:
739
- return 'Unknown health issue';
740
- }
741
- }
742
- async determineAction(issue) {
743
- const tracker = this.activityTrackers.get(issue.agentId);
744
- switch (issue.issueType) {
745
- case 'process_crashed':
746
- // For crashes, try to reassign if auto-reassign is enabled
747
- if (this.config.autoReassign && issue.taskId) {
748
- return 'reassign_task';
749
- }
750
- return 'notify_director';
751
- case 'no_output':
752
- case 'session_stale':
753
- // First try pinging
754
- if (tracker && tracker.pingAttempts < this.config.maxPingAttempts) {
755
- return 'send_ping';
756
- }
757
- // Then try restart
758
- if (this.config.autoRestart) {
759
- return 'restart';
760
- }
761
- return 'notify_director';
762
- case 'repeated_errors':
763
- case 'high_error_rate':
764
- // Notify Director about error patterns
765
- if (this.config.notifyDirector) {
766
- return 'notify_director';
767
- }
768
- return 'monitor';
769
- case 'unresponsive':
770
- // Escalate unresponsive agents
771
- if (issue.severity === 'critical') {
772
- return 'escalate';
773
- }
774
- if (this.config.autoRestart) {
775
- return 'restart';
776
- }
777
- return 'notify_director';
778
- default:
779
- return 'monitor';
780
- }
781
- }
782
- getRecommendedActionDescription(issue) {
783
- switch (issue.issueType) {
784
- case 'process_crashed':
785
- return 'Restart the agent and reassign any incomplete tasks.';
786
- case 'no_output':
787
- return 'Check if the agent is stuck and consider restarting.';
788
- case 'repeated_errors':
789
- return 'Review the task requirements and agent logs to identify the root cause.';
790
- case 'high_error_rate':
791
- return 'The agent may be struggling with the current task. Consider reassignment.';
792
- case 'session_stale':
793
- return 'The agent session may be hung. Try restarting the session.';
794
- case 'unresponsive':
795
- return 'The agent is not responding. Force terminate and reassign task.';
796
- default:
797
- return 'Monitor the situation and take action if it persists.';
798
- }
799
- }
800
- findExistingIssue(agentId, issueType) {
801
- for (const issue of this.activeIssues.values()) {
802
- if (issue.agentId === agentId && issue.issueType === issueType) {
803
- return issue;
804
- }
805
- }
806
- return undefined;
807
- }
808
- async issueStillApplies(issue) {
809
- const tracker = this.activityTrackers.get(issue.agentId);
810
- if (!tracker) {
811
- return false;
812
- }
813
- const now = Date.now();
814
- switch (issue.issueType) {
815
- case 'no_output':
816
- if (!tracker.lastOutputAt)
817
- return true;
818
- const timeSinceOutput = now - this.getTimestampMs(tracker.lastOutputAt);
819
- return timeSinceOutput > this.config.noOutputThresholdMs;
820
- case 'repeated_errors':
821
- const recentErrors = this.countRecentEvents(tracker.errorTimestamps, this.config.errorWindowMs);
822
- return recentErrors >= this.config.errorCountThreshold;
823
- case 'process_crashed':
824
- // Crashes are resolved by explicit restart/reassign
825
- return true;
826
- case 'high_error_rate':
827
- const errors = this.countRecentEvents(tracker.errorTimestamps, this.config.errorWindowMs);
828
- const outputs = this.countRecentEvents(tracker.outputTimestamps, this.config.errorWindowMs);
829
- if (outputs === 0)
830
- return false;
831
- return (errors / (errors + outputs)) > 0.5;
832
- case 'session_stale':
833
- const session = this.sessionManager.getActiveSession(issue.agentId);
834
- if (!session?.lastActivityAt)
835
- return false;
836
- const timeSinceActivity = now - this.getTimestampMs(session.lastActivityAt);
837
- return timeSinceActivity > this.config.staleSessionThresholdMs;
838
- default:
839
- return false;
840
- }
841
- }
842
- countRecentEvents(timestamps, windowMs) {
843
- const now = Date.now();
844
- const windowStart = now - windowMs;
845
- return timestamps.filter((t) => this.getTimestampMs(t) >= windowStart).length;
846
- }
847
- pruneTimestamps(tracker) {
848
- const now = Date.now();
849
- const windowStart = now - this.config.errorWindowMs;
850
- tracker.errorTimestamps = tracker.errorTimestamps.filter((t) => this.getTimestampMs(t) >= windowStart);
851
- tracker.outputTimestamps = tracker.outputTimestamps.filter((t) => this.getTimestampMs(t) >= windowStart);
852
- }
853
- getTimestampMs(timestamp) {
854
- return typeof timestamp === 'number' ? timestamp : new Date(timestamp).getTime();
855
- }
856
- }
857
- // ============================================================================
858
- // Factory Function
859
- // ============================================================================
860
- /**
861
- * Creates a HealthStewardService instance
862
- */
863
- export function createHealthStewardService(api, agentRegistry, sessionManager, taskAssignment, dispatchService, config) {
864
- return new HealthStewardServiceImpl(api, agentRegistry, sessionManager, taskAssignment, dispatchService, config);
865
- }
866
- //# sourceMappingURL=health-steward-service.js.map