@agent-relay/daemon 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-manager.d.ts +134 -0
- package/dist/agent-manager.d.ts.map +1 -0
- package/dist/agent-manager.js +578 -0
- package/dist/agent-manager.js.map +1 -0
- package/dist/agent-registry.d.ts +99 -0
- package/dist/agent-registry.d.ts.map +1 -0
- package/dist/agent-registry.js +213 -0
- package/dist/agent-registry.js.map +1 -0
- package/dist/agent-signing.d.ts +158 -0
- package/dist/agent-signing.d.ts.map +1 -0
- package/dist/agent-signing.js +523 -0
- package/dist/agent-signing.js.map +1 -0
- package/dist/api.d.ts +106 -0
- package/dist/api.d.ts.map +1 -0
- package/dist/api.js +876 -0
- package/dist/api.js.map +1 -0
- package/dist/auth.d.ts +94 -0
- package/dist/auth.d.ts.map +1 -0
- package/dist/auth.js +197 -0
- package/dist/auth.js.map +1 -0
- package/dist/channel-membership-store.d.ts +55 -0
- package/dist/channel-membership-store.d.ts.map +1 -0
- package/dist/channel-membership-store.js +176 -0
- package/dist/channel-membership-store.js.map +1 -0
- package/dist/cli-auth.d.ts +89 -0
- package/dist/cli-auth.d.ts.map +1 -0
- package/dist/cli-auth.js +792 -0
- package/dist/cli-auth.js.map +1 -0
- package/dist/cloud-sync.d.ts +150 -0
- package/dist/cloud-sync.d.ts.map +1 -0
- package/dist/cloud-sync.js +446 -0
- package/dist/cloud-sync.js.map +1 -0
- package/dist/connection.d.ts +130 -0
- package/dist/connection.d.ts.map +1 -0
- package/dist/connection.js +438 -0
- package/dist/connection.js.map +1 -0
- package/dist/consensus-integration.d.ts +167 -0
- package/dist/consensus-integration.d.ts.map +1 -0
- package/dist/consensus-integration.js +371 -0
- package/dist/consensus-integration.js.map +1 -0
- package/dist/consensus.d.ts +271 -0
- package/dist/consensus.d.ts.map +1 -0
- package/dist/consensus.js +632 -0
- package/dist/consensus.js.map +1 -0
- package/dist/delivery-tracker.d.ts +34 -0
- package/dist/delivery-tracker.d.ts.map +1 -0
- package/dist/delivery-tracker.js +104 -0
- package/dist/delivery-tracker.js.map +1 -0
- package/dist/enhanced-features.d.ts +118 -0
- package/dist/enhanced-features.d.ts.map +1 -0
- package/dist/enhanced-features.js +176 -0
- package/dist/enhanced-features.js.map +1 -0
- package/dist/index.d.ts +31 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +37 -0
- package/dist/index.js.map +1 -0
- package/dist/migrations/index.d.ts +73 -0
- package/dist/migrations/index.d.ts.map +1 -0
- package/dist/migrations/index.js +241 -0
- package/dist/migrations/index.js.map +1 -0
- package/dist/orchestrator.d.ts +217 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +1143 -0
- package/dist/orchestrator.js.map +1 -0
- package/dist/rate-limiter.d.ts +68 -0
- package/dist/rate-limiter.d.ts.map +1 -0
- package/dist/rate-limiter.js +130 -0
- package/dist/rate-limiter.js.map +1 -0
- package/dist/registry.d.ts +9 -0
- package/dist/registry.d.ts.map +1 -0
- package/dist/registry.js +9 -0
- package/dist/registry.js.map +1 -0
- package/dist/relay-ledger.d.ts +261 -0
- package/dist/relay-ledger.d.ts.map +1 -0
- package/dist/relay-ledger.js +532 -0
- package/dist/relay-ledger.js.map +1 -0
- package/dist/relay-watchdog.d.ts +125 -0
- package/dist/relay-watchdog.d.ts.map +1 -0
- package/dist/relay-watchdog.js +611 -0
- package/dist/relay-watchdog.js.map +1 -0
- package/dist/repo-manager.d.ts +116 -0
- package/dist/repo-manager.d.ts.map +1 -0
- package/dist/repo-manager.js +384 -0
- package/dist/repo-manager.js.map +1 -0
- package/dist/router.d.ts +370 -0
- package/dist/router.d.ts.map +1 -0
- package/dist/router.js +1437 -0
- package/dist/router.js.map +1 -0
- package/dist/server.d.ts +174 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +1001 -0
- package/dist/server.js.map +1 -0
- package/dist/spawn-manager.d.ts +78 -0
- package/dist/spawn-manager.d.ts.map +1 -0
- package/dist/spawn-manager.js +165 -0
- package/dist/spawn-manager.js.map +1 -0
- package/dist/sync-queue.d.ts +116 -0
- package/dist/sync-queue.d.ts.map +1 -0
- package/dist/sync-queue.js +361 -0
- package/dist/sync-queue.js.map +1 -0
- package/dist/types.d.ts +133 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/workspace-manager.d.ts +80 -0
- package/dist/workspace-manager.d.ts.map +1 -0
- package/dist/workspace-manager.js +314 -0
- package/dist/workspace-manager.js.map +1 -0
- package/package.json +52 -0
|
@@ -0,0 +1,1143 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Daemon Orchestrator
|
|
3
|
+
*
|
|
4
|
+
* Manages multiple workspace daemons and provides a unified API for the dashboard.
|
|
5
|
+
* This is the top-level service that runs by default, handling workspace switching
|
|
6
|
+
* and agent management across all connected repositories.
|
|
7
|
+
*/
|
|
8
|
+
import * as http from 'http';
|
|
9
|
+
import * as path from 'path';
|
|
10
|
+
import * as fs from 'fs';
|
|
11
|
+
import { EventEmitter } from 'events';
|
|
12
|
+
import { WebSocketServer, WebSocket } from 'ws';
|
|
13
|
+
import { createLogger, metrics, getSupervisor, getMemoryMonitor, formatBytes, } from '@agent-relay/resiliency';
|
|
14
|
+
import { Daemon } from './server.js';
|
|
15
|
+
import { AgentSpawner } from '@agent-relay/bridge';
|
|
16
|
+
import { getProjectPaths } from '@agent-relay/config';
|
|
17
|
+
const logger = createLogger('orchestrator');
|
|
18
|
+
/**
 * Generate a short opaque identifier for workspaces/agents.
 *
 * Combines a timestamp prefix with two independent random segments so
 * ids have a stable minimum length and are far less collision-prone
 * than a single Math.random() draw (the original could return as few
 * as ~10 chars and draws from one PRNG sample). Not cryptographically
 * secure — do not use for secrets or tokens.
 *
 * @returns {string} opaque identifier string
 */
function generateId() {
    // toString(36) can yield short tails; pad each segment to 8 chars.
    const segment = () => Math.random().toString(36).substring(2, 10).padEnd(8, '0');
    return Date.now().toString(36) + segment() + segment();
}
|
|
21
|
+
// Defaults applied when the Orchestrator constructor receives a partial config.
const DEFAULT_CONFIG = {
    port: 3456,
    host: 'localhost',
    // NOTE(review): when HOME is unset this degrades to a CWD-relative
    // '.agent-relay/orchestrator' path — confirm that is intended.
    dataDir: path.join(process.env.HOME || '', '.agent-relay', 'orchestrator'),
    autoStartDaemons: true,
};
// Heartbeat cadence — presumably consumed by startHealthMonitoring(),
// which is not visible in this chunk.
const HEARTBEAT_INTERVAL_MS = 10_000;
// Minimum gap between repeated resource alerts (consumer not visible here).
const RESOURCE_ALERT_COOLDOWN_MS = 60_000;
// CPU alert threshold in percent, overridable via env var.
const parsedCpuThreshold = parseFloat(process.env.AGENT_CPU_ALERT_THRESHOLD || '300');
// Guard against NaN from a malformed AGENT_CPU_ALERT_THRESHOLD value.
const CPU_ALERT_THRESHOLD = Number.isFinite(parsedCpuThreshold) ? parsedCpuThreshold : 300;
|
|
31
|
+
export class Orchestrator extends EventEmitter {
    // Effective configuration: DEFAULT_CONFIG merged with the constructor arg.
    config;
    // workspaceId -> workspace record; daemon/spawner are attached while running.
    workspaces = new Map();
    // Id of the currently active workspace, or undefined when none is active.
    activeWorkspaceId;
    // HTTP server instance (created in start()).
    server;
    // WebSocket server attached to the HTTP server (created in start()).
    wss;
    // WebSocket -> per-client session data (userId, activeWorkspaceId, ...).
    sessions = new Map();
    // Process supervisor with auto-restart and context persistence.
    supervisor = getSupervisor({
        autoRestart: true,
        maxRestarts: 5,
        contextPersistence: { enabled: true, autoInjectOnRestart: true },
    });
    // Path to the persisted workspaces.json (derived in the constructor).
    workspacesFile;
    // Track alive status for ping/pong keepalive
    clientAlive = new WeakMap();
    // Timer for the 30s WebSocket keepalive loop (start()/stop()).
    pingInterval;
    // Timer cleared in stop(); set by code outside this chunk.
    heartbeatInterval;
    // Memory monitor sampling every 10s; handlers are detached in stop().
    memoryMonitor = getMemoryMonitor({ checkIntervalMs: 10_000 });
    // Per-agent health records; key shape is defined by registerAgentHealth,
    // which is not visible in this chunk — TODO confirm.
    agentHealth = new Map();
    // Event handler references for cleanup
    memorySampleHandler;
    memoryAlertHandler;
|
|
53
|
+
constructor(config = {}) {
|
|
54
|
+
super();
|
|
55
|
+
this.config = { ...DEFAULT_CONFIG, ...config };
|
|
56
|
+
this.workspacesFile = path.join(this.config.dataDir, 'workspaces.json');
|
|
57
|
+
// Ensure data directory exists
|
|
58
|
+
if (!fs.existsSync(this.config.dataDir)) {
|
|
59
|
+
fs.mkdirSync(this.config.dataDir, { recursive: true });
|
|
60
|
+
}
|
|
61
|
+
// Load existing workspaces
|
|
62
|
+
this.loadWorkspaces();
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Start the orchestrator
|
|
66
|
+
*/
|
|
67
|
+
async start() {
|
|
68
|
+
logger.info('Starting orchestrator', {
|
|
69
|
+
port: this.config.port,
|
|
70
|
+
host: this.config.host,
|
|
71
|
+
});
|
|
72
|
+
// Start supervisor
|
|
73
|
+
this.supervisor.start();
|
|
74
|
+
// Auto-start daemons for workspaces
|
|
75
|
+
if (this.config.autoStartDaemons) {
|
|
76
|
+
for (const [id, workspace] of this.workspaces) {
|
|
77
|
+
if (fs.existsSync(workspace.path)) {
|
|
78
|
+
await this.startWorkspaceDaemon(id);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
// Start HTTP server
|
|
83
|
+
this.server = http.createServer((req, res) => this.handleRequest(req, res));
|
|
84
|
+
// Setup WebSocket
|
|
85
|
+
this.wss = new WebSocketServer({ server: this.server });
|
|
86
|
+
this.wss.on('connection', (ws, req) => this.handleWebSocket(ws, req));
|
|
87
|
+
// Setup ping/pong keepalive (30 second interval)
|
|
88
|
+
this.pingInterval = setInterval(() => {
|
|
89
|
+
this.wss?.clients.forEach((ws) => {
|
|
90
|
+
if (this.clientAlive.get(ws) === false) {
|
|
91
|
+
logger.info('WebSocket client unresponsive, closing');
|
|
92
|
+
ws.terminate();
|
|
93
|
+
return;
|
|
94
|
+
}
|
|
95
|
+
this.clientAlive.set(ws, false);
|
|
96
|
+
ws.ping();
|
|
97
|
+
});
|
|
98
|
+
}, 30000);
|
|
99
|
+
this.startHealthMonitoring();
|
|
100
|
+
return new Promise((resolve) => {
|
|
101
|
+
this.server.listen(this.config.port, this.config.host, () => {
|
|
102
|
+
logger.info('Orchestrator started', {
|
|
103
|
+
url: `http://${this.config.host}:${this.config.port}`,
|
|
104
|
+
});
|
|
105
|
+
resolve();
|
|
106
|
+
});
|
|
107
|
+
});
|
|
108
|
+
}
|
|
109
|
+
    /**
     * Stop the orchestrator, releasing resources roughly in reverse order
     * of start(): timers, then monitor handlers, workspace daemons, the
     * supervisor, and finally the WebSocket and HTTP servers.
     *
     * @returns {Promise<void>|undefined} resolves when the HTTP server has
     *          closed; returns undefined when the server was never started.
     */
    async stop() {
        logger.info('Stopping orchestrator');
        // Clear ping interval
        if (this.pingInterval) {
            clearInterval(this.pingInterval);
            this.pingInterval = undefined;
        }
        if (this.heartbeatInterval) {
            clearInterval(this.heartbeatInterval);
            this.heartbeatInterval = undefined;
        }
        // Clean up memory monitor event handlers before stopping
        if (this.memorySampleHandler) {
            this.memoryMonitor.off('sample', this.memorySampleHandler);
            this.memorySampleHandler = undefined;
        }
        if (this.memoryAlertHandler) {
            this.memoryMonitor.off('alert', this.memoryAlertHandler);
            this.memoryAlertHandler = undefined;
        }
        this.memoryMonitor.stop();
        // Stop all workspace daemons (sequentially, so each finishes cleanup).
        for (const [id] of this.workspaces) {
            await this.stopWorkspaceDaemon(id);
        }
        // Stop supervisor
        this.supervisor.stop();
        // Close WebSocket connections
        if (this.wss) {
            for (const ws of this.wss.clients) {
                ws.close();
            }
            this.wss.close();
        }
        // Close HTTP server
        if (this.server) {
            return new Promise((resolve) => {
                this.server.close(() => {
                    logger.info('Orchestrator stopped');
                    resolve();
                });
            });
        }
    }
|
|
156
|
+
// === Workspace Management ===
|
|
157
|
+
/**
|
|
158
|
+
* Add a workspace
|
|
159
|
+
*/
|
|
160
|
+
addWorkspace(request) {
|
|
161
|
+
const resolvedPath = this.resolvePath(request.path);
|
|
162
|
+
// Check if already exists
|
|
163
|
+
const existing = this.findWorkspaceByPath(resolvedPath);
|
|
164
|
+
if (existing) {
|
|
165
|
+
return existing;
|
|
166
|
+
}
|
|
167
|
+
// Validate path exists
|
|
168
|
+
if (!fs.existsSync(resolvedPath)) {
|
|
169
|
+
throw new Error(`Path does not exist: ${resolvedPath}`);
|
|
170
|
+
}
|
|
171
|
+
const workspace = {
|
|
172
|
+
id: generateId(),
|
|
173
|
+
name: request.name || path.basename(resolvedPath),
|
|
174
|
+
path: resolvedPath,
|
|
175
|
+
status: 'inactive',
|
|
176
|
+
provider: request.provider || this.detectProvider(resolvedPath),
|
|
177
|
+
createdAt: new Date(),
|
|
178
|
+
lastActiveAt: new Date(),
|
|
179
|
+
...this.getGitInfo(resolvedPath),
|
|
180
|
+
};
|
|
181
|
+
this.workspaces.set(workspace.id, workspace);
|
|
182
|
+
this.saveWorkspaces();
|
|
183
|
+
logger.info('Workspace added', { id: workspace.id, name: workspace.name });
|
|
184
|
+
this.broadcastEvent({
|
|
185
|
+
type: 'workspace:added',
|
|
186
|
+
workspaceId: workspace.id,
|
|
187
|
+
data: this.toPublicWorkspace(workspace),
|
|
188
|
+
timestamp: new Date(),
|
|
189
|
+
});
|
|
190
|
+
// Auto-start daemon
|
|
191
|
+
if (this.config.autoStartDaemons) {
|
|
192
|
+
this.startWorkspaceDaemon(workspace.id).catch((err) => {
|
|
193
|
+
logger.error('Failed to start workspace daemon', { id: workspace.id, error: String(err) });
|
|
194
|
+
});
|
|
195
|
+
}
|
|
196
|
+
return this.toPublicWorkspace(workspace);
|
|
197
|
+
}
|
|
198
|
+
    /**
     * Remove a workspace: stop its daemon, clear the active pointer when it
     * referenced this workspace, persist, and broadcast 'workspace:removed'.
     *
     * @param {string} workspaceId
     * @returns {Promise<boolean>} false when the id is unknown.
     */
    async removeWorkspace(workspaceId) {
        const workspace = this.workspaces.get(workspaceId);
        if (!workspace)
            return false;
        // Stop daemon if running
        await this.stopWorkspaceDaemon(workspaceId);
        // Clear active if this was active
        if (this.activeWorkspaceId === workspaceId) {
            this.activeWorkspaceId = undefined;
        }
        this.workspaces.delete(workspaceId);
        this.saveWorkspaces();
        logger.info('Workspace removed', { id: workspaceId });
        this.broadcastEvent({
            type: 'workspace:removed',
            workspaceId,
            data: { id: workspaceId },
            timestamp: new Date(),
        });
        return true;
    }
|
|
222
|
+
    /**
     * Make the given workspace active: demote the previously active one to
     * 'inactive', start this workspace's daemon if it is not running,
     * persist the change, and broadcast 'workspace:switched'.
     *
     * @param {string} workspaceId
     * @returns {Promise<object>} public record of the now-active workspace.
     * @throws {Error} when the workspace id is unknown, or propagated from
     *         startWorkspaceDaemon on daemon startup failure.
     */
    async switchWorkspace(workspaceId) {
        const workspace = this.workspaces.get(workspaceId);
        if (!workspace) {
            throw new Error(`Workspace not found: ${workspaceId}`);
        }
        const previousId = this.activeWorkspaceId;
        // Update status
        if (previousId && previousId !== workspaceId) {
            const prev = this.workspaces.get(previousId);
            if (prev) {
                prev.status = 'inactive';
            }
        }
        workspace.status = 'active';
        workspace.lastActiveAt = new Date();
        this.activeWorkspaceId = workspaceId;
        // Ensure daemon is running
        if (!workspace.daemon?.isRunning) {
            await this.startWorkspaceDaemon(workspaceId);
        }
        this.saveWorkspaces();
        logger.info('Switched workspace', { id: workspaceId, name: workspace.name });
        this.broadcastEvent({
            type: 'workspace:switched',
            workspaceId,
            data: { previousId, currentId: workspaceId },
            timestamp: new Date(),
        });
        return this.toPublicWorkspace(workspace);
    }
|
|
255
|
+
/**
|
|
256
|
+
* Get all workspaces
|
|
257
|
+
*/
|
|
258
|
+
getWorkspaces() {
|
|
259
|
+
return Array.from(this.workspaces.values()).map((w) => this.toPublicWorkspace(w));
|
|
260
|
+
}
|
|
261
|
+
/**
|
|
262
|
+
* Get workspace by ID
|
|
263
|
+
*/
|
|
264
|
+
getWorkspace(workspaceId) {
|
|
265
|
+
const workspace = this.workspaces.get(workspaceId);
|
|
266
|
+
return workspace ? this.toPublicWorkspace(workspace) : undefined;
|
|
267
|
+
}
|
|
268
|
+
/**
|
|
269
|
+
* Get active workspace
|
|
270
|
+
*/
|
|
271
|
+
getActiveWorkspace() {
|
|
272
|
+
if (!this.activeWorkspaceId)
|
|
273
|
+
return undefined;
|
|
274
|
+
return this.getWorkspace(this.activeWorkspaceId);
|
|
275
|
+
}
|
|
276
|
+
// === Agent Management ===
|
|
277
|
+
    /**
     * Spawn a named agent inside a workspace, lazily starting the
     * workspace daemon and spawner first if needed. Registers the agent
     * for health monitoring when a PID is available and broadcasts
     * 'agent:spawned'.
     *
     * @param {string} workspaceId
     * @param {{name: string, provider?: string, task?: string}} request
     * @returns {Promise<object>} the new agent record.
     * @throws {Error} when the workspace is unknown or the spawn fails.
     */
    async spawnAgent(workspaceId, request) {
        const workspace = this.workspaces.get(workspaceId);
        if (!workspace) {
            throw new Error(`Workspace not found: ${workspaceId}`);
        }
        // Ensure daemon is running
        if (!workspace.daemon?.isRunning) {
            await this.startWorkspaceDaemon(workspaceId);
        }
        // Ensure spawner exists
        if (!workspace.spawner) {
            workspace.spawner = new AgentSpawner({
                projectRoot: workspace.path,
                onMarkSpawning: (name) => workspace.daemon?.markSpawning(name),
                onClearSpawning: (name) => workspace.daemon?.clearSpawning(name),
            });
        }
        const result = await workspace.spawner.spawn({
            name: request.name,
            cli: this.getCliForProvider(request.provider || workspace.provider),
            task: request.task || '',
        });
        if (!result.success) {
            throw new Error(result.error || 'Failed to spawn agent');
        }
        const agent = {
            id: generateId(),
            name: request.name,
            workspaceId,
            provider: request.provider || workspace.provider,
            status: 'running',
            pid: result.pid,
            task: request.task,
            spawnedAt: new Date(),
            restartCount: 0,
        };
        // Register for health monitoring if we have a PID
        if (result.pid) {
            this.registerAgentHealth(workspaceId, request.name, result.pid);
        }
        else {
            logger.warn('Agent spawned without PID - health monitoring disabled', {
                workspaceId,
                agentName: request.name,
            });
        }
        logger.info('Agent spawned', { id: agent.id, name: agent.name, workspaceId, pid: result.pid });
        this.broadcastEvent({
            type: 'agent:spawned',
            workspaceId,
            agentId: agent.id,
            data: agent,
            timestamp: new Date(),
        });
        return agent;
    }
|
|
336
|
+
    /**
     * Gracefully stop an agent via the workspace spawner.
     *
     * The agent is flagged as "releasing" before the release so the agent
     * death handler does not announce a crash for an intentional stop.
     * On success, health tracking is removed and 'agent:stopped' is
     * broadcast; on a failed release, the flag is cleared; on an
     * exception, health tracking is removed and the error is rethrown.
     *
     * @param {string} workspaceId
     * @param {string} agentName
     * @returns {Promise<boolean>} false when the workspace/spawner is
     *          missing or the release did not succeed.
     * @throws propagated spawner.release errors.
     */
    async stopAgent(workspaceId, agentName) {
        const workspace = this.workspaces.get(workspaceId);
        if (!workspace?.spawner)
            return false;
        // Mark as releasing BEFORE stopping to prevent crash announcement
        this.markAgentReleasing(workspaceId, agentName);
        try {
            const released = await workspace.spawner.release(agentName);
            if (released) {
                // Unregister from health monitoring after successful release
                this.unregisterAgentHealth(workspaceId, agentName);
                this.broadcastEvent({
                    type: 'agent:stopped',
                    workspaceId,
                    data: { name: agentName },
                    timestamp: new Date(),
                });
                logger.info('Agent stopped gracefully', { workspaceId, agentName });
            }
            else {
                // Release failed - clear the releasing flag
                const health = this.getAgentHealth(workspaceId, agentName);
                if (health) {
                    health.releasing = false;
                }
            }
            return released;
        }
        catch (err) {
            // Release threw an exception - clean up health tracking to avoid stuck state
            this.unregisterAgentHealth(workspaceId, agentName);
            logger.error('Agent release failed with exception', {
                workspaceId,
                agentName,
                error: String(err),
            });
            throw err;
        }
    }
|
|
378
|
+
/**
|
|
379
|
+
* Get agents in a workspace
|
|
380
|
+
*/
|
|
381
|
+
getAgents(workspaceId) {
|
|
382
|
+
const workspace = this.workspaces.get(workspaceId);
|
|
383
|
+
if (!workspace?.spawner)
|
|
384
|
+
return [];
|
|
385
|
+
return workspace.spawner.getActiveWorkers().map((w) => {
|
|
386
|
+
// Get health data for this agent
|
|
387
|
+
const health = this.getAgentHealth(workspaceId, w.name);
|
|
388
|
+
return {
|
|
389
|
+
id: w.name,
|
|
390
|
+
name: w.name,
|
|
391
|
+
workspaceId,
|
|
392
|
+
provider: this.detectProviderFromCli(w.cli),
|
|
393
|
+
status: 'running',
|
|
394
|
+
pid: w.pid,
|
|
395
|
+
task: w.task,
|
|
396
|
+
spawnedAt: new Date(w.spawnedAt),
|
|
397
|
+
lastHealthCheck: health?.lastHeartbeatAt,
|
|
398
|
+
rssBytes: health?.lastRssBytes,
|
|
399
|
+
cpuPercent: health?.lastCpuPercent,
|
|
400
|
+
restartCount: 0,
|
|
401
|
+
};
|
|
402
|
+
});
|
|
403
|
+
}
|
|
404
|
+
// === Private Methods ===
|
|
405
|
+
    /**
     * Start the relay daemon (and an AgentSpawner) for a workspace.
     *
     * No-op when the workspace is unknown or its daemon is already
     * running. Wires an agent-death callback that notifies both the
     * dashboard (WebSocket 'agent:crashed' event) and other connected
     * agents (daemon system broadcast). On failure the workspace status
     * is set to 'error' and the error is rethrown.
     *
     * @param {string} workspaceId
     * @throws propagated daemon startup errors.
     */
    async startWorkspaceDaemon(workspaceId) {
        const workspace = this.workspaces.get(workspaceId);
        if (!workspace)
            return;
        if (workspace.daemon?.isRunning)
            return;
        try {
            const paths = getProjectPaths(workspace.path);
            workspace.daemon = new Daemon({
                socketPath: paths.socketPath,
                teamDir: paths.teamDir,
            });
            await workspace.daemon.start();
            workspace.status = 'active';
            // Create spawner
            workspace.spawner = new AgentSpawner({
                projectRoot: workspace.path,
                onMarkSpawning: (name) => workspace.daemon?.markSpawning(name),
                onClearSpawning: (name) => workspace.daemon?.clearSpawning(name),
            });
            // Set up agent death notifications
            workspace.spawner.setOnAgentDeath((info) => {
                // Broadcast to dashboard via WebSocket
                this.broadcastEvent({
                    type: 'agent:crashed',
                    workspaceId,
                    data: {
                        name: info.name,
                        exitCode: info.exitCode,
                        continuityAgentId: info.agentId,
                        resumeInstructions: info.resumeInstructions,
                    },
                    timestamp: new Date(),
                });
                // Broadcast to all connected agents via relay
                const message = info.agentId
                    ? `AGENT DIED: "${info.name}" has crashed (exit code: ${info.exitCode}). Agent ID: ${info.agentId}. ${info.resumeInstructions}`
                    : `AGENT DIED: "${info.name}" has crashed (exit code: ${info.exitCode}).`;
                workspace.daemon?.broadcastSystemMessage(message, {
                    agentName: info.name,
                    exitCode: info.exitCode,
                    agentId: info.agentId,
                    resumeInstructions: info.resumeInstructions,
                });
                logger.warn('Agent died', {
                    name: info.name,
                    exitCode: info.exitCode,
                    agentId: info.agentId,
                });
            });
            logger.info('Workspace daemon started', { id: workspaceId, socket: paths.socketPath });
        }
        catch (err) {
            workspace.status = 'error';
            logger.error('Failed to start workspace daemon', { id: workspaceId, error: String(err) });
            throw err;
        }
    }
|
|
466
|
+
    /**
     * Stop a workspace's daemon and release all of its agents.
     *
     * Agents are flagged as "releasing" first so the death callback does
     * not announce intentional shutdowns as crashes; health tracking is
     * cleaned up only after releaseAll() completes.
     *
     * @param {string} workspaceId - unknown ids are a silent no-op.
     */
    async stopWorkspaceDaemon(workspaceId) {
        const workspace = this.workspaces.get(workspaceId);
        if (!workspace)
            return;
        // Mark all agents as releasing to prevent crash announcements
        const workspaceHealth = this.getWorkspaceAgentHealth(workspaceId);
        for (const health of workspaceHealth) {
            this.markAgentReleasing(workspaceId, health.agentName);
        }
        // Release all agents first
        if (workspace.spawner) {
            await workspace.spawner.releaseAll();
        }
        // Clean up health monitoring for all agents in this workspace
        for (const health of workspaceHealth) {
            this.unregisterAgentHealth(workspaceId, health.agentName);
        }
        // Stop daemon
        if (workspace.daemon) {
            await workspace.daemon.stop();
            workspace.daemon = undefined;
        }
        workspace.spawner = undefined;
        workspace.status = 'inactive';
        logger.info('Workspace daemon stopped', { id: workspaceId });
    }
|
|
495
|
+
    /**
     * Route one HTTP request.
     *
     * Routes: GET / (health), GET /metrics (Prometheus text),
     * GET|POST /workspaces, GET|DELETE /workspaces/:id,
     * POST /workspaces/:id/switch, GET|POST /workspaces/:id/agents,
     * DELETE /workspaces/:id/agents/:name. Unknown routes get 404;
     * any thrown error becomes a 500 JSON body.
     *
     * NOTE(review): CORS is wide open ('*') and there is no
     * authentication on any route — confirm this service is only ever
     * bound to localhost.
     *
     * @param {http.IncomingMessage} req
     * @param {http.ServerResponse} res
     */
    async handleRequest(req, res) {
        // CORS
        res.setHeader('Access-Control-Allow-Origin', '*');
        res.setHeader('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS');
        res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
        if (req.method === 'OPTIONS') {
            res.writeHead(204);
            res.end();
            return;
        }
        const url = new URL(req.url || '/', `http://${req.headers.host}`);
        const pathname = url.pathname;
        const method = req.method || 'GET';
        try {
            let response;
            // Health check
            if (pathname === '/' && method === 'GET') {
                response = { status: 200, body: { status: 'ok', version: '1.0.0' } };
            }
            // Metrics
            else if (pathname === '/metrics' && method === 'GET') {
                res.setHeader('Content-Type', 'text/plain');
                res.writeHead(200);
                res.end(metrics.toPrometheus());
                return;
            }
            // List workspaces
            else if (pathname === '/workspaces' && method === 'GET') {
                response = {
                    status: 200,
                    body: {
                        workspaces: this.getWorkspaces(),
                        activeWorkspaceId: this.activeWorkspaceId,
                    },
                };
            }
            // Add workspace
            else if (pathname === '/workspaces' && method === 'POST') {
                const body = await this.parseBody(req);
                const workspace = this.addWorkspace(body);
                response = { status: 201, body: workspace };
            }
            // Get workspace
            else if (pathname.match(/^\/workspaces\/[^/]+$/) && method === 'GET') {
                const id = pathname.split('/')[2];
                const workspace = this.getWorkspace(id);
                response = workspace
                    ? { status: 200, body: workspace }
                    : { status: 404, body: { error: 'Not found' } };
            }
            // Delete workspace
            else if (pathname.match(/^\/workspaces\/[^/]+$/) && method === 'DELETE') {
                const id = pathname.split('/')[2];
                const removed = await this.removeWorkspace(id);
                response = removed
                    ? { status: 204, body: null }
                    : { status: 404, body: { error: 'Not found' } };
            }
            // Switch workspace
            else if (pathname.match(/^\/workspaces\/[^/]+\/switch$/) && method === 'POST') {
                const id = pathname.split('/')[2];
                const workspace = await this.switchWorkspace(id);
                response = { status: 200, body: workspace };
            }
            // List agents in workspace
            else if (pathname.match(/^\/workspaces\/[^/]+\/agents$/) && method === 'GET') {
                const id = pathname.split('/')[2];
                const agents = this.getAgents(id);
                response = { status: 200, body: { agents, workspaceId: id } };
            }
            // Spawn agent
            else if (pathname.match(/^\/workspaces\/[^/]+\/agents$/) && method === 'POST') {
                const id = pathname.split('/')[2];
                const body = await this.parseBody(req);
                const agent = await this.spawnAgent(id, body);
                response = { status: 201, body: agent };
            }
            // Stop agent
            else if (pathname.match(/^\/workspaces\/[^/]+\/agents\/[^/]+$/) && method === 'DELETE') {
                const parts = pathname.split('/');
                const workspaceId = parts[2];
                const agentName = parts[4];
                const stopped = await this.stopAgent(workspaceId, agentName);
                response = stopped
                    ? { status: 204, body: null }
                    : { status: 404, body: { error: 'Not found' } };
            }
            // Not found
            else {
                response = { status: 404, body: { error: 'Not found' } };
            }
            res.setHeader('Content-Type', 'application/json');
            res.writeHead(response.status);
            res.end(response.body ? JSON.stringify(response.body) : '');
        }
        catch (err) {
            logger.error('Request error', { error: String(err) });
            res.setHeader('Content-Type', 'application/json');
            res.writeHead(500);
            res.end(JSON.stringify({ error: String(err) }));
        }
    }
|
|
600
|
+
    /**
     * Handle a new WebSocket client: register it for keepalive tracking,
     * create an (unauthenticated) session, push the initial state
     * snapshot, then route incoming JSON messages.
     *
     * @param {WebSocket} ws
     * @param {http.IncomingMessage} _req - unused; no auth is performed here.
     */
    handleWebSocket(ws, _req) {
        logger.info('WebSocket client connected');
        // Mark client as alive for ping/pong keepalive
        this.clientAlive.set(ws, true);
        // Handle pong responses
        ws.on('pong', () => {
            this.clientAlive.set(ws, true);
        });
        const session = {
            userId: 'anonymous',
            githubUsername: 'anonymous',
            connectedAt: new Date(),
            activeWorkspaceId: this.activeWorkspaceId,
        };
        this.sessions.set(ws, session);
        // Send initial state
        this.sendToClient(ws, {
            type: 'init',
            data: {
                workspaces: this.getWorkspaces(),
                activeWorkspaceId: this.activeWorkspaceId,
                agents: this.activeWorkspaceId ? this.getAgents(this.activeWorkspaceId) : [],
            },
        });
        ws.on('message', (data) => {
            try {
                const msg = JSON.parse(data.toString());
                this.handleWebSocketMessage(ws, session, msg);
            }
            catch (err) {
                // Malformed JSON from a client is logged, never fatal.
                logger.error('WebSocket message error', { error: String(err) });
            }
        });
        ws.on('close', () => {
            this.sessions.delete(ws);
            logger.info('WebSocket client disconnected');
        });
    }
|
|
641
|
+
/**
|
|
642
|
+
* Handle WebSocket message
|
|
643
|
+
*/
|
|
644
|
+
handleWebSocketMessage(ws, session, msg) {
|
|
645
|
+
switch (msg.type) {
|
|
646
|
+
case 'switch_workspace':
|
|
647
|
+
if (typeof msg.data === 'string') {
|
|
648
|
+
this.switchWorkspace(msg.data)
|
|
649
|
+
.then((workspace) => {
|
|
650
|
+
session.activeWorkspaceId = workspace.id;
|
|
651
|
+
})
|
|
652
|
+
.catch((err) => {
|
|
653
|
+
this.sendToClient(ws, { type: 'error', data: String(err) });
|
|
654
|
+
});
|
|
655
|
+
}
|
|
656
|
+
break;
|
|
657
|
+
case 'ping':
|
|
658
|
+
this.sendToClient(ws, { type: 'pong' });
|
|
659
|
+
break;
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
/**
|
|
663
|
+
* Send to WebSocket client
|
|
664
|
+
*/
|
|
665
|
+
sendToClient(ws, msg) {
|
|
666
|
+
if (ws.readyState === WebSocket.OPEN) {
|
|
667
|
+
ws.send(JSON.stringify(msg));
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
/**
|
|
671
|
+
* Broadcast event to all clients
|
|
672
|
+
*/
|
|
673
|
+
broadcastEvent(event) {
|
|
674
|
+
if (!this.wss)
|
|
675
|
+
return;
|
|
676
|
+
const msg = JSON.stringify({ type: 'event', data: event });
|
|
677
|
+
for (const ws of this.wss.clients) {
|
|
678
|
+
if (ws.readyState === WebSocket.OPEN) {
|
|
679
|
+
ws.send(msg);
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
/**
|
|
684
|
+
* Parse request body
|
|
685
|
+
*/
|
|
686
|
+
parseBody(req) {
|
|
687
|
+
return new Promise((resolve, reject) => {
|
|
688
|
+
let data = '';
|
|
689
|
+
req.on('data', (chunk) => (data += chunk));
|
|
690
|
+
req.on('end', () => {
|
|
691
|
+
try {
|
|
692
|
+
resolve(data ? JSON.parse(data) : {});
|
|
693
|
+
}
|
|
694
|
+
catch {
|
|
695
|
+
reject(new Error('Invalid JSON'));
|
|
696
|
+
}
|
|
697
|
+
});
|
|
698
|
+
});
|
|
699
|
+
}
|
|
700
|
+
    /**
     * Rehydrate the workspace map from workspaces.json.
     *
     * Every loaded workspace starts as 'inactive' (daemons are attached
     * later, in start()). Missing file is a silent no-op; a corrupt file
     * is logged and ignored so startup still succeeds.
     */
    loadWorkspaces() {
        if (!fs.existsSync(this.workspacesFile))
            return;
        try {
            const data = JSON.parse(fs.readFileSync(this.workspacesFile, 'utf8'));
            for (const w of data.workspaces || []) {
                this.workspaces.set(w.id, {
                    ...w,
                    // Dates were serialized as strings; revive them.
                    createdAt: new Date(w.createdAt),
                    lastActiveAt: new Date(w.lastActiveAt),
                    status: 'inactive',
                });
            }
            this.activeWorkspaceId = data.activeWorkspaceId;
            logger.info('Loaded workspaces', { count: this.workspaces.size });
        }
        catch (err) {
            logger.error('Failed to load workspaces', { error: String(err) });
        }
    }
|
|
723
|
+
/**
|
|
724
|
+
* Save workspaces to disk
|
|
725
|
+
*/
|
|
726
|
+
saveWorkspaces() {
|
|
727
|
+
try {
|
|
728
|
+
const data = {
|
|
729
|
+
workspaces: Array.from(this.workspaces.values()).map((w) => this.toPublicWorkspace(w)),
|
|
730
|
+
activeWorkspaceId: this.activeWorkspaceId,
|
|
731
|
+
};
|
|
732
|
+
fs.writeFileSync(this.workspacesFile, JSON.stringify(data, null, 2));
|
|
733
|
+
}
|
|
734
|
+
catch (err) {
|
|
735
|
+
logger.error('Failed to save workspaces', { error: String(err) });
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
/**
|
|
739
|
+
* Find workspace by path
|
|
740
|
+
*/
|
|
741
|
+
findWorkspaceByPath(path) {
|
|
742
|
+
const resolved = this.resolvePath(path);
|
|
743
|
+
const workspace = Array.from(this.workspaces.values()).find((w) => w.path === resolved);
|
|
744
|
+
return workspace ? this.toPublicWorkspace(workspace) : undefined;
|
|
745
|
+
}
|
|
746
|
+
/**
|
|
747
|
+
* Resolve path
|
|
748
|
+
*/
|
|
749
|
+
resolvePath(p) {
|
|
750
|
+
if (p.startsWith('~')) {
|
|
751
|
+
p = path.join(process.env.HOME || '', p.slice(1));
|
|
752
|
+
}
|
|
753
|
+
return path.resolve(p);
|
|
754
|
+
}
|
|
755
|
+
/**
|
|
756
|
+
* Detect provider from workspace
|
|
757
|
+
*/
|
|
758
|
+
detectProvider(workspacePath) {
|
|
759
|
+
if (fs.existsSync(path.join(workspacePath, 'CLAUDE.md')) ||
|
|
760
|
+
fs.existsSync(path.join(workspacePath, '.claude'))) {
|
|
761
|
+
return 'claude';
|
|
762
|
+
}
|
|
763
|
+
if (fs.existsSync(path.join(workspacePath, '.codex'))) {
|
|
764
|
+
return 'codex';
|
|
765
|
+
}
|
|
766
|
+
if (fs.existsSync(path.join(workspacePath, '.gemini'))) {
|
|
767
|
+
return 'gemini';
|
|
768
|
+
}
|
|
769
|
+
return 'generic';
|
|
770
|
+
}
|
|
771
|
+
/**
|
|
772
|
+
* Detect provider from CLI command
|
|
773
|
+
*/
|
|
774
|
+
detectProviderFromCli(cli) {
|
|
775
|
+
if (cli.includes('claude'))
|
|
776
|
+
return 'claude';
|
|
777
|
+
if (cli.includes('codex'))
|
|
778
|
+
return 'codex';
|
|
779
|
+
if (cli.includes('gemini'))
|
|
780
|
+
return 'gemini';
|
|
781
|
+
return 'generic';
|
|
782
|
+
}
|
|
783
|
+
/**
|
|
784
|
+
* Get CLI command for provider
|
|
785
|
+
*/
|
|
786
|
+
getCliForProvider(provider) {
|
|
787
|
+
switch (provider) {
|
|
788
|
+
case 'claude':
|
|
789
|
+
return 'claude';
|
|
790
|
+
case 'codex':
|
|
791
|
+
return 'codex';
|
|
792
|
+
case 'gemini':
|
|
793
|
+
return 'gemini';
|
|
794
|
+
default:
|
|
795
|
+
return 'claude';
|
|
796
|
+
}
|
|
797
|
+
}
|
|
798
|
+
/**
|
|
799
|
+
* Get git info
|
|
800
|
+
*/
|
|
801
|
+
getGitInfo(workspacePath) {
|
|
802
|
+
try {
|
|
803
|
+
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
|
804
|
+
const { execSync } = require('child_process');
|
|
805
|
+
const branch = execSync('git branch --show-current', {
|
|
806
|
+
cwd: workspacePath,
|
|
807
|
+
encoding: 'utf8',
|
|
808
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
809
|
+
}).trim();
|
|
810
|
+
let remote;
|
|
811
|
+
try {
|
|
812
|
+
remote = execSync('git remote get-url origin', {
|
|
813
|
+
cwd: workspacePath,
|
|
814
|
+
encoding: 'utf8',
|
|
815
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
816
|
+
}).trim();
|
|
817
|
+
}
|
|
818
|
+
catch {
|
|
819
|
+
// No remote
|
|
820
|
+
}
|
|
821
|
+
return { gitRemote: remote, gitBranch: branch };
|
|
822
|
+
}
|
|
823
|
+
catch {
|
|
824
|
+
return {};
|
|
825
|
+
}
|
|
826
|
+
}
|
|
827
|
+
/**
|
|
828
|
+
* Convert to public workspace (without internal references)
|
|
829
|
+
*/
|
|
830
|
+
toPublicWorkspace(w) {
|
|
831
|
+
return {
|
|
832
|
+
id: w.id,
|
|
833
|
+
name: w.name,
|
|
834
|
+
path: w.path,
|
|
835
|
+
status: w.status,
|
|
836
|
+
provider: w.provider,
|
|
837
|
+
createdAt: w.createdAt,
|
|
838
|
+
lastActiveAt: w.lastActiveAt,
|
|
839
|
+
cloudId: w.cloudId,
|
|
840
|
+
customDomain: w.customDomain,
|
|
841
|
+
gitRemote: w.gitRemote,
|
|
842
|
+
gitBranch: w.gitBranch,
|
|
843
|
+
};
|
|
844
|
+
}
|
|
845
|
+
// === Health Monitoring ===
|
|
846
|
+
/**
 * Start agent health monitoring.
 * Monitors PIDs for liveness and tracks memory/CPU usage.
 *
 * Wires two listeners onto the memory monitor (handler references are
 * stored on `this` so they can be removed on shutdown) and starts a
 * periodic heartbeat that checks registered agent PIDs.
 */
startHealthMonitoring() {
    // Start the memory monitor
    this.memoryMonitor.start();
    // Listen for memory samples to update health state
    // Store handler reference for cleanup
    this.memorySampleHandler = (event) => {
        const health = this.agentHealth.get(event.name);
        if (health) {
            health.lastSampleAt = new Date();
            health.lastRssBytes = event.snapshot.rssBytes;
            health.lastCpuPercent = event.snapshot.cpuPercent;
            // Check for high CPU usage and broadcast alert
            // (broadcastResourceAlert applies its own cooldown)
            if (event.snapshot.cpuPercent >= CPU_ALERT_THRESHOLD) {
                this.broadcastResourceAlert(health, 'cpu', event.snapshot.cpuPercent);
            }
        }
    };
    this.memoryMonitor.on('sample', this.memorySampleHandler);
    // Listen for memory alerts and broadcast to agents
    // Store handler reference for cleanup
    this.memoryAlertHandler = (alert) => {
        const health = this.agentHealth.get(alert.agentName);
        // 'recovered' alerts are informational only — not re-broadcast
        if (health && alert.type !== 'recovered') {
            this.broadcastResourceAlert(health, 'memory', alert.currentRss, alert);
        }
    };
    this.memoryMonitor.on('alert', this.memoryAlertHandler);
    // Start heartbeat interval to check PIDs are alive
    this.heartbeatInterval = setInterval(() => {
        this.checkAgentHeartbeats();
    }, HEARTBEAT_INTERVAL_MS);
    logger.info('Health monitoring started', {
        heartbeatIntervalMs: HEARTBEAT_INTERVAL_MS,
        cpuAlertThreshold: CPU_ALERT_THRESHOLD,
    });
}
|
|
886
|
+
/**
 * Check all registered agents' PIDs are still alive.
 * If a PID has died unexpectedly, broadcast a crash notification.
 *
 * Agents marked `releasing` (graceful shutdown in progress) are
 * exempted from crash handling.
 */
checkAgentHeartbeats() {
    // Collect crashed agents first to avoid modifying map during iteration
    const crashedAgents = [];
    for (const [key, health] of this.agentHealth) {
        const isAlive = this.isProcessAlive(health.pid);
        if (isAlive) {
            // Only update heartbeat timestamp for alive processes
            health.lastHeartbeatAt = new Date();
        }
        else if (!health.releasing) {
            // Agent died unexpectedly - mark for crash handling
            // Immediately remove from map to prevent duplicate handling on next interval
            this.agentHealth.delete(key);
            crashedAgents.push(health);
        }
        // If !isAlive && health.releasing, agent is being gracefully stopped - skip
    }
    // Now handle crashes outside the iteration
    for (const health of crashedAgents) {
        logger.warn('Agent heartbeat failed - process died', {
            workspaceId: health.workspaceId,
            agentName: health.agentName,
            pid: health.pid,
        });
        this.handleAgentCrash(health);
    }
}
|
|
917
|
+
/**
|
|
918
|
+
* Check if a process is alive by sending signal 0.
|
|
919
|
+
*/
|
|
920
|
+
isProcessAlive(pid) {
|
|
921
|
+
try {
|
|
922
|
+
process.kill(pid, 0);
|
|
923
|
+
return true;
|
|
924
|
+
}
|
|
925
|
+
catch {
|
|
926
|
+
return false;
|
|
927
|
+
}
|
|
928
|
+
}
|
|
929
|
+
/**
 * Handle an agent crash - unregister and broadcast to other agents.
 * Note: Agent is already removed from agentHealth map before this is called.
 *
 * Notifies two audiences: the dashboard (via WebSocket broadcast with
 * full crash-context analysis) and the workspace's agents (via the
 * relay daemon's system-message broadcast), then logs the crash.
 */
handleAgentCrash(health) {
    const workspace = this.workspaces.get(health.workspaceId);
    // Get crash context from memory monitor for analysis
    // (must be fetched BEFORE unregistering, which may discard history)
    const crashContext = this.memoryMonitor.getCrashContext(health.agentName);
    // Unregister from memory monitor (agent already removed from agentHealth map)
    this.memoryMonitor.unregister(health.agentName);
    // Broadcast crash to dashboard via WebSocket
    this.broadcastEvent({
        type: 'agent:crashed',
        workspaceId: health.workspaceId,
        data: {
            name: health.agentName,
            pid: health.pid,
            crashContext: {
                likelyCause: crashContext.likelyCause,
                peakMemory: crashContext.peakMemory,
                averageMemory: crashContext.averageMemory,
                memoryTrend: crashContext.memoryTrend,
                analysisNotes: crashContext.analysisNotes,
            },
        },
        timestamp: new Date(),
    });
    // Broadcast to all connected agents in the workspace via relay
    // Include cause + up to two analysis notes only when a cause was identified
    const message = crashContext.likelyCause !== 'unknown'
        ? `AGENT CRASHED: "${health.agentName}" has died unexpectedly (PID: ${health.pid}). Likely cause: ${crashContext.likelyCause}. ${crashContext.analysisNotes.slice(0, 2).join('. ')}`
        : `AGENT CRASHED: "${health.agentName}" has died unexpectedly (PID: ${health.pid}).`;
    // Optional chaining: workspace may be gone, or have no running daemon
    workspace?.daemon?.broadcastSystemMessage(message, {
        agentName: health.agentName,
        pid: health.pid,
        likelyCause: crashContext.likelyCause,
        crashType: 'heartbeat_failure',
    });
    logger.error('Agent crashed', {
        workspaceId: health.workspaceId,
        agentName: health.agentName,
        pid: health.pid,
        likelyCause: crashContext.likelyCause,
    });
}
|
|
973
|
+
/**
 * Broadcast a resource alert (memory or CPU) to agents.
 *
 * @param health - Health record of the affected agent.
 * @param resourceType - 'memory' or 'cpu'.
 * @param currentValue - Current usage: RSS bytes for memory, percent for CPU.
 * @param memoryAlert - Original memory-monitor alert (memory alerts only);
 *   supplies level/message/recommendation. Absent for CPU alerts.
 */
broadcastResourceAlert(health, resourceType, currentValue, memoryAlert) {
    // CPU alert cooldown to avoid spamming
    // (memory alerts are rate-limited upstream by the memory monitor)
    if (resourceType === 'cpu') {
        const now = Date.now();
        if (health.lastCpuAlertAt && now - health.lastCpuAlertAt < RESOURCE_ALERT_COOLDOWN_MS) {
            return; // Still in cooldown
        }
        health.lastCpuAlertAt = now;
    }
    const workspace = this.workspaces.get(health.workspaceId);
    // Broadcast to dashboard
    this.broadcastEvent({
        type: 'agent:resource-alert',
        workspaceId: health.workspaceId,
        agentId: health.agentName,
        data: {
            name: health.agentName,
            resourceType,
            currentValue,
            // CPU alerts have no memoryAlert, so fall back to 'high_cpu'
            alertLevel: memoryAlert?.type ?? 'high_cpu',
            message: memoryAlert?.message ??
                `Agent "${health.agentName}" is running at ${currentValue.toFixed(1)}% CPU`,
            recommendation: memoryAlert?.recommendation ??
                'Consider reducing workload or checking for runaway processes',
        },
        timestamp: new Date(),
    });
    // Broadcast to agents
    const message = resourceType === 'memory'
        ? `RESOURCE ALERT: "${health.agentName}" memory usage is ${memoryAlert?.type ?? 'high'} (${formatBytes(currentValue)}). ${memoryAlert?.recommendation ?? ''}`
        : `RESOURCE ALERT: "${health.agentName}" is running at ${currentValue.toFixed(1)}% CPU. Consider reducing workload.`;
    // Workspace or its daemon may be absent — best-effort delivery
    workspace?.daemon?.broadcastSystemMessage(message, {
        agentName: health.agentName,
        resourceType,
        alertLevel: memoryAlert?.type ?? 'high_cpu',
    });
    logger.warn('Resource alert', {
        workspaceId: health.workspaceId,
        agentName: health.agentName,
        resourceType,
        currentValue: resourceType === 'memory' ? formatBytes(currentValue) : `${currentValue.toFixed(1)}%`,
        alertLevel: memoryAlert?.type ?? 'high_cpu',
    });
}
|
|
1020
|
+
/**
|
|
1021
|
+
* Register an agent for health monitoring.
|
|
1022
|
+
*/
|
|
1023
|
+
registerAgentHealth(workspaceId, agentName, pid) {
|
|
1024
|
+
const key = `${workspaceId}:${agentName}`;
|
|
1025
|
+
// Guard against double-registration - update PID instead
|
|
1026
|
+
if (this.agentHealth.has(key)) {
|
|
1027
|
+
logger.warn('Agent already registered for health monitoring, updating PID', {
|
|
1028
|
+
workspaceId,
|
|
1029
|
+
agentName,
|
|
1030
|
+
newPid: pid,
|
|
1031
|
+
});
|
|
1032
|
+
this.updateAgentHealthPid(workspaceId, agentName, pid);
|
|
1033
|
+
return;
|
|
1034
|
+
}
|
|
1035
|
+
this.agentHealth.set(key, {
|
|
1036
|
+
key,
|
|
1037
|
+
workspaceId,
|
|
1038
|
+
agentName,
|
|
1039
|
+
pid,
|
|
1040
|
+
lastHeartbeatAt: new Date(),
|
|
1041
|
+
});
|
|
1042
|
+
// Register with memory monitor
|
|
1043
|
+
this.memoryMonitor.register(agentName, pid);
|
|
1044
|
+
logger.info('Agent registered for health monitoring', {
|
|
1045
|
+
workspaceId,
|
|
1046
|
+
agentName,
|
|
1047
|
+
pid,
|
|
1048
|
+
});
|
|
1049
|
+
}
|
|
1050
|
+
/**
|
|
1051
|
+
* Update PID for an agent (after restart).
|
|
1052
|
+
*
|
|
1053
|
+
* This method is intended for agent restart scenarios where the agent process
|
|
1054
|
+
* is restarted with a new PID but should maintain continuity in health tracking.
|
|
1055
|
+
* Currently unused but reserved for future auto-restart functionality.
|
|
1056
|
+
*
|
|
1057
|
+
* @param workspaceId - The workspace ID
|
|
1058
|
+
* @param agentName - The agent name
|
|
1059
|
+
* @param newPid - The new process ID after restart
|
|
1060
|
+
*/
|
|
1061
|
+
updateAgentHealthPid(workspaceId, agentName, newPid) {
|
|
1062
|
+
const key = `${workspaceId}:${agentName}`;
|
|
1063
|
+
const health = this.agentHealth.get(key);
|
|
1064
|
+
if (health) {
|
|
1065
|
+
health.pid = newPid;
|
|
1066
|
+
health.releasing = false;
|
|
1067
|
+
health.lastHeartbeatAt = new Date();
|
|
1068
|
+
this.memoryMonitor.updatePid(agentName, newPid);
|
|
1069
|
+
logger.info('Agent health PID updated', {
|
|
1070
|
+
workspaceId,
|
|
1071
|
+
agentName,
|
|
1072
|
+
newPid,
|
|
1073
|
+
});
|
|
1074
|
+
}
|
|
1075
|
+
else {
|
|
1076
|
+
// Register new
|
|
1077
|
+
this.registerAgentHealth(workspaceId, agentName, newPid);
|
|
1078
|
+
}
|
|
1079
|
+
}
|
|
1080
|
+
/**
|
|
1081
|
+
* Mark an agent as releasing (to avoid crash announcement).
|
|
1082
|
+
*/
|
|
1083
|
+
markAgentReleasing(workspaceId, agentName) {
|
|
1084
|
+
const key = `${workspaceId}:${agentName}`;
|
|
1085
|
+
const health = this.agentHealth.get(key);
|
|
1086
|
+
if (health) {
|
|
1087
|
+
health.releasing = true;
|
|
1088
|
+
logger.debug('Agent marked as releasing', { workspaceId, agentName });
|
|
1089
|
+
}
|
|
1090
|
+
}
|
|
1091
|
+
/**
|
|
1092
|
+
* Unregister an agent from health monitoring.
|
|
1093
|
+
*/
|
|
1094
|
+
unregisterAgentHealth(workspaceId, agentName) {
|
|
1095
|
+
const key = `${workspaceId}:${agentName}`;
|
|
1096
|
+
this.agentHealth.delete(key);
|
|
1097
|
+
this.memoryMonitor.unregister(agentName);
|
|
1098
|
+
logger.debug('Agent unregistered from health monitoring', {
|
|
1099
|
+
workspaceId,
|
|
1100
|
+
agentName,
|
|
1101
|
+
});
|
|
1102
|
+
}
|
|
1103
|
+
/**
|
|
1104
|
+
* Get health state for an agent.
|
|
1105
|
+
*/
|
|
1106
|
+
getAgentHealth(workspaceId, agentName) {
|
|
1107
|
+
return this.agentHealth.get(`${workspaceId}:${agentName}`);
|
|
1108
|
+
}
|
|
1109
|
+
/**
|
|
1110
|
+
* Get health states for all agents in a workspace.
|
|
1111
|
+
*/
|
|
1112
|
+
getWorkspaceAgentHealth(workspaceId) {
|
|
1113
|
+
return Array.from(this.agentHealth.values()).filter((h) => h.workspaceId === workspaceId);
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
// Module-level singleton managed by startOrchestrator/stopOrchestrator below.
let orchestratorInstance;
|
|
1117
|
+
/**
 * Start the orchestrator
 *
 * Idempotent singleton start: returns the existing instance when one is
 * already running. If `start()` rejects, the cached reference is
 * cleared before rethrowing — previously a failed start left the
 * half-initialized instance cached, so every subsequent call returned
 * a broken orchestrator with no way to retry.
 *
 * @param config - Orchestrator configuration (defaults to {}).
 * @returns The running orchestrator instance.
 */
export async function startOrchestrator(config = {}) {
    if (orchestratorInstance) {
        return orchestratorInstance;
    }
    orchestratorInstance = new Orchestrator(config);
    try {
        await orchestratorInstance.start();
    }
    catch (err) {
        // Allow a retry after a failed start instead of caching the failure.
        orchestratorInstance = undefined;
        throw err;
    }
    return orchestratorInstance;
}
|
|
1128
|
+
/**
 * Stop the orchestrator
 *
 * Stops the running singleton, if any. The cached reference is cleared
 * even when `stop()` throws (via finally) — previously a failing stop
 * left the dead instance cached, blocking any later restart.
 */
export async function stopOrchestrator() {
    if (!orchestratorInstance) {
        return;
    }
    try {
        await orchestratorInstance.stop();
    }
    finally {
        orchestratorInstance = undefined;
    }
}
|
|
1137
|
+
/**
 * Get orchestrator instance
 *
 * @returns The running orchestrator singleton, or undefined when the
 *   orchestrator has not been started (or has been stopped).
 */
export function getOrchestrator() {
    return orchestratorInstance;
}
|
|
1143
|
+
//# sourceMappingURL=orchestrator.js.map
|