@hamp10/agentforge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/worker.js ADDED
@@ -0,0 +1,1791 @@
1
+ #!/usr/bin/env node
2
+
3
+ import WebSocket from 'ws';
4
+ import { OpenClawCLI } from './OpenClawCLI.js';
5
+ import { HampAgentCLI } from './HampAgentCLI.js';
6
+ import { OllamaAgent } from './OllamaAgent.js';
7
+ import EventEmitter from 'events';
8
+ import path from 'path';
9
+ import { existsSync, readdirSync, readFileSync, mkdirSync, writeFileSync, copyFileSync, statSync, unlinkSync } from 'fs';
10
+ import { fileURLToPath } from 'url';
11
+ import { homedir, hostname } from 'os';
12
+ import { spawn } from 'child_process';
13
+
14
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
15
+
16
+ export class AgentForgeWorker extends EventEmitter {
17
+ constructor(token, railwayUrl = 'wss://agentforgeai-production.up.railway.app/socket', workerConfig = {}) {
18
+ super();
19
+ this.token = token;
20
+ this.railwayUrl = railwayUrl;
21
+ this.ws = null;
22
+
23
+ // Pick agent backend from config
24
+ if (workerConfig.provider === 'local') {
25
+ const url = workerConfig.localUrl || 'http://localhost:11434';
26
+ const model = workerConfig.localModel || 'llama3.1:8b';
27
+ console.log(`đŸĻ™ Using local model backend: ${url} / ${model}`);
28
+ this.cli = new OllamaAgent(url, model);
29
+ } else if (OpenClawCLI.isAvailable()) {
30
+ this.cli = new OpenClawCLI();
31
+ // Wire in the OpenClaw Gateway streaming config so OpenClawCLI can use
32
+ // per-token SSE streaming instead of waiting for subprocess to exit
33
+ try {
34
+ const cfgPath = path.join(homedir(), '.openclaw', 'openclaw.json');
35
+ if (existsSync(cfgPath)) {
36
+ const cfg = JSON.parse(readFileSync(cfgPath, 'utf-8'));
37
+ const port = cfg?.gateway?.port || 18789;
38
+ const token = cfg?.gateway?.auth?.token;
39
+ if (port && token) {
40
+ this.cli.gatewayPort = port;
41
+ this.cli.gatewayToken = token;
42
+ console.log(`🌊 OpenClaw Gateway streaming enabled (port ${port})`);
43
+ }
44
+ }
45
+ } catch (err) {
46
+ console.warn(`âš ī¸ Could not load gateway config for streaming: ${err.message}`);
47
+ }
48
+ } else {
49
+ // openclaw not installed — cannot run without a configured backend
50
+ console.error('');
51
+ console.error('❌ No AI backend configured.');
52
+ console.error('');
53
+ console.error(' AgentForge needs an AI model to run agents.');
54
+ console.error(' Configure a local model server (Ollama, LM Studio, Jan, etc.):');
55
+ console.error('');
56
+ console.error(' agentforge local --url http://localhost:11434 --model llama3.1:8b');
57
+ console.error('');
58
+ console.error(' Then run: agentforge start');
59
+ console.error('');
60
+ process.exit(1);
61
+ }
62
+ // Hampagent — always available alongside openclaw
63
+ this.hampagent = new HampAgentCLI();
64
+
65
+ this.activeAgents = new Map();
66
+ this.reconnectAttempts = 0;
67
+ this.maxReconnectAttempts = 10;
68
+
69
+ // Per-agent task queues to prevent concurrent openclaw processes
70
+ this.agentQueues = new Map(); // agentId -> array of tasks
71
+ this.agentProcessing = new Map(); // agentId -> boolean (is currently processing)
72
+ this.processingStartTime = new Map(); // agentId -> timestamp when processing started
73
+ this.PROCESSING_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes max for stale processing state (large projects with Opus can be slow)
74
+
75
+ // Track running tasks for cancellation
76
+ this.runningTasks = new Map(); // taskId -> { agentId, cancelled }
77
+
78
+ // Queue for messages that couldn't be sent while disconnected
79
+ this.pendingMessages = [];
80
+ this.maxPendingMessages = 100; // Prevent unbounded growth
81
+
82
+ // Track recently sent completions to prevent duplicates
83
+ this.recentCompletions = new Set();
84
+ this.completionTTL = 30000; // 30 seconds
85
+
86
+ // Track agent activity for stuck detection
87
+ this.lastAgentActivity = new Map(); // agentId -> timestamp
88
+ this.pingsSinceActivity = new Map(); // agentId -> count
89
+ this.STUCK_PING_THRESHOLD = 2; // 2 pings with no activity = stuck (~60s since server pings every 30s)
90
+ }
91
+
92
+ speakTextOutLoud(utterance) {
93
+ if (!utterance || typeof utterance !== 'string') return;
94
+ const text = utterance.trim();
95
+ if (!text) return;
96
+
97
+ if (process.platform !== 'darwin') {
98
+ console.log('🔇 TTS requested but platform is not macOS; skipping local audio playback.');
99
+ return;
100
+ }
101
+
102
+ try {
103
+ const volumeProc = spawn('osascript', ['-e', 'set volume without output muted', '-e', 'set volume output volume 80'], {
104
+ stdio: 'ignore'
105
+ });
106
+ volumeProc.on('error', (err) => console.warn('âš ī¸ Unable to adjust volume:', err.message));
107
+ } catch (err) {
108
+ console.warn('âš ī¸ Volume adjustment failed:', err.message);
109
+ }
110
+
111
+ try {
112
+ const sayProc = spawn('say', [text], { stdio: 'ignore' });
113
+ sayProc.on('error', (err) => console.warn('âš ī¸ "say" command failed:', err.message));
114
+ console.log(`🔊 Speaking aloud: "${text.slice(0, 80)}${text.length > 80 ? 'â€Ļ' : ''}"`);
115
+ } catch (err) {
116
+ console.warn('âš ī¸ Unable to invoke "say":', err.message);
117
+ }
118
+ }
119
+
120
+ extractSpeechText(toolInput) {
121
+ if (!toolInput) return '';
122
+
123
+ const preferredKeys = ['text', 'input', 'message', 'utterance', 'prompt', 'transcript', 'content'];
124
+ const visited = new Set();
125
+
126
+ const readValue = (value) => {
127
+ if (value == null) return '';
128
+ if (typeof value === 'string') {
129
+ const trimmed = value.trim();
130
+ return trimmed.length > 0 ? trimmed : '';
131
+ }
132
+ if (typeof value === 'number') {
133
+ return String(value);
134
+ }
135
+ if (Array.isArray(value)) {
136
+ for (const item of value) {
137
+ const result = readValue(item);
138
+ if (result) return result;
139
+ }
140
+ return '';
141
+ }
142
+ if (typeof value === 'object') {
143
+ if (visited.has(value)) return '';
144
+ visited.add(value);
145
+
146
+ if (typeof value.text === 'string' && value.text.trim()) {
147
+ return value.text.trim();
148
+ }
149
+ if (value.type === 'text' && typeof value.text === 'string' && value.text.trim()) {
150
+ return value.text.trim();
151
+ }
152
+
153
+ for (const key of preferredKeys) {
154
+ if (key in value) {
155
+ const result = readValue(value[key]);
156
+ if (result) return result;
157
+ }
158
+ }
159
+
160
+ for (const nested of Object.values(value)) {
161
+ const result = readValue(nested);
162
+ if (result) return result;
163
+ }
164
+ }
165
+ return '';
166
+ };
167
+
168
+ return readValue(toolInput);
169
+ }
170
+
171
+ async initialize() {
172
+ this._killOrphanedAgents();
173
+ this.installPreviewServer();
174
+ this._startAutoUpdateCheck();
175
+ console.log('✅ Worker initialized');
176
+ }
177
+
178
+ _killOrphanedAgents() {
179
+ // Kill any openclaw agent processes left over from a previous worker session.
180
+ // Without this, orphaned processes reconnect to the gateway and block the task queue.
181
+ for (const name of ['openclaw-agent', 'openclaw-gateway']) {
182
+ try {
183
+ const p = spawn('pkill', ['-f', name], { stdio: 'ignore' });
184
+ p.on('close', (code) => {
185
+ if (code === 0) console.log(`🧹 Killed orphaned ${name} processes`);
186
+ });
187
+ } catch {}
188
+ }
189
+ }
190
+
191
+ /**
192
+ * Guard against bloated OpenClaw sessions causing API failures.
193
+ * OpenClaw sessions accumulate ALL messages (tool calls, results, thinking blocks).
194
+ * Once a session hits ~400KB the Anthropic API starts returning internal server errors
195
+ * and the agent deadlocks — it can't respond or terminate.
196
+ *
197
+ * Strategy: keep the most recent messages from the session JSONL so the agent
198
+ * retains recent context (roughly the last N turns), rather than wiping it entirely.
199
+ * This minimises context loss while preventing unbounded growth.
200
+ */
201
+ _guardOpenClawSession(agentId) {
202
+ const MAX_SESSION_BYTES = 400_000; // ~400KB — trim above this
203
+ const KEEP_BYTES = 150_000; // keep last ~150KB of messages
204
+ const sessionsDir = path.join(homedir(), '.openclaw', 'agents', agentId, 'sessions');
205
+ if (!existsSync(sessionsDir)) return;
206
+
207
+ let trimmed = false;
208
+ for (const file of readdirSync(sessionsDir)) {
209
+ if (!file.endsWith('.jsonl')) continue;
210
+ const filePath = path.join(sessionsDir, file);
211
+ let size;
212
+ try { size = statSync(filePath).size; } catch { continue; }
213
+ if (size <= MAX_SESSION_BYTES) continue;
214
+
215
+ // Trim: keep the session header line(s) + the tail of the file
216
+ try {
217
+ const raw = readFileSync(filePath, 'utf-8');
218
+ const lines = raw.split('\n').filter(l => l.trim());
219
+ // Always keep the first line (session metadata)
220
+ const header = lines[0] || '';
221
+ // Rebuild from the tail until we have roughly KEEP_BYTES
222
+ const tail = [];
223
+ let kept = 0;
224
+ for (let i = lines.length - 1; i >= 1; i--) {
225
+ kept += Buffer.byteLength(lines[i], 'utf-8') + 1;
226
+ if (kept > KEEP_BYTES) break;
227
+ tail.unshift(lines[i]);
228
+ }
229
+ const trimmedContent = [header, ...tail].join('\n') + '\n';
230
+ writeFileSync(filePath, trimmedContent, 'utf-8');
231
+ console.log(`âœ‚ī¸ [${agentId}] Trimmed session ${file} from ${Math.round(size/1024)}KB → ${Math.round(Buffer.byteLength(trimmedContent,'utf-8')/1024)}KB (kept last ${tail.length} msgs)`);
232
+ trimmed = true;
233
+ } catch (e) {
234
+ // If trimming fails, delete entirely — a fresh session is better than a deadlock
235
+ try { unlinkSync(filePath); console.log(`đŸ—‘ī¸ [${agentId}] Deleted oversized session ${file} (${Math.round(size/1024)}KB) — trim failed`); } catch {}
236
+ }
237
+ }
238
+ return trimmed;
239
+ }
240
+
241
+ installPreviewServer() {
242
+ try {
243
+ const src = path.join(__dirname, 'preview-server.js');
244
+ const destDir = path.join(homedir(), '.agentforge');
245
+ const dest = path.join(destDir, 'preview-server.js');
246
+ if (!existsSync(destDir)) mkdirSync(destDir, { recursive: true });
247
+ copyFileSync(src, dest);
248
+ console.log('🔍 Preview server installed at ~/.agentforge/preview-server.js');
249
+ } catch (e) {
250
+ console.warn('âš ī¸ Could not install preview server:', e.message);
251
+ }
252
+ }
253
+
254
+ async connect() {
255
+ // Start periodic queue health check (every 10 seconds)
256
+ if (!this.queueHealthInterval) {
257
+ this.queueHealthInterval = setInterval(() => {
258
+ // Log current state
259
+ const processing = [];
260
+ const queued = [];
261
+ for (const [agentId, isProcessing] of this.agentProcessing.entries()) {
262
+ if (isProcessing) {
263
+ const elapsed = Date.now() - (this.processingStartTime.get(agentId) || Date.now());
264
+ processing.push(`${agentId.slice(-8)}(${Math.round(elapsed/1000)}s)`);
265
+ }
266
+ }
267
+ for (const [agentId, queue] of this.agentQueues.entries()) {
268
+ if (queue.length > 0) {
269
+ queued.push(`${agentId.slice(-8)}:${queue.length}`);
270
+ }
271
+ }
272
+ if (processing.length > 0 || queued.length > 0) {
273
+ console.log(`📊 State: processing=[${processing.join(',')}] queued=[${queued.join(',')}]`);
274
+ }
275
+
276
+ this.resetStaleProcessingStates();
277
+ this.processAllQueues();
278
+ }, 10000);
279
+ }
280
+
281
+ return new Promise((resolve, reject) => {
282
+ console.log(`🔄 Connecting to ${this.railwayUrl}...`);
283
+
284
+ this.ws = new WebSocket(this.railwayUrl, {
285
+ headers: {
286
+ 'Authorization': `Bearer ${this.token}`,
287
+ 'X-Worker-Type': 'agentforge'
288
+ }
289
+ });
290
+
291
+ this.ws.on('open', () => {
292
+ console.log('✅ Connected to AgentForge');
293
+ const wasReconnect = this.reconnectAttempts > 0;
294
+ this.reconnectAttempts = 0;
295
+
296
+ // Collect any tasks still running on this machine so the server doesn't
297
+ // falsely declare them idle when it processes the reconnect.
298
+ const liveTaskIds = [];
299
+ for (const [taskId, taskInfo] of this.runningTasks.entries()) {
300
+ if (!taskInfo.cancelled) liveTaskIds.push(taskId);
301
+ }
302
+
303
+ // Register worker
304
+ this.send({
305
+ type: 'worker_register',
306
+ capabilities: {
307
+ maxAgents: 100, // API limits support 100+, real limit is local resources
308
+ openclawConfigured: true, // Uses OpenClaw with Claude Max (browser OAuth, no API key needed)
309
+ platform: process.platform,
310
+ arch: process.arch,
311
+ deviceName: homedir().split('/').pop() + '@' + hostname(),
312
+ hostname: hostname(),
313
+ anthropicApiKey: process.env.ANTHROPIC_API_KEY || this.cli?.anthropicApiKey || null
314
+ },
315
+ // Tasks still running locally — server should keep them active, not broadcast idle
316
+ liveTaskIds
317
+ });
318
+
319
+ // On reconnect, reset stale processing states and flush pending messages
320
+ if (wasReconnect) {
321
+ this.resetStaleProcessingStates();
322
+ setTimeout(() => {
323
+ this.flushPendingMessages();
324
+ this.processAllQueues(); // Kick-start any stalled queues
325
+ }, 500);
326
+ }
327
+
328
+ resolve();
329
+ });
330
+
331
+ this.ws.on('message', (data) => {
332
+ try {
333
+ const message = JSON.parse(data.toString());
334
+ this.handleMessage(message);
335
+ } catch (error) {
336
+ console.error('❌ Failed to parse message:', error);
337
+ }
338
+ });
339
+
340
+ this.ws.on('close', () => {
341
+ console.log('❌ Disconnected from AgentForge');
342
+ this.handleDisconnect();
343
+ });
344
+
345
+ this.ws.on('error', (error) => {
346
+ console.error('❌ WebSocket error:', error.message);
347
+ reject(error);
348
+ });
349
+ });
350
+ }
351
+
352
+ handleDisconnect() {
353
+ // Warn about running tasks that might have results queued
354
+ const runningCount = this.runningTasks.size;
355
+ const pendingCount = this.pendingMessages.length;
356
+ if (runningCount > 0 || pendingCount > 0) {
357
+ console.log(`âš ī¸ Disconnect with ${runningCount} running tasks, ${pendingCount} pending messages`);
358
+ }
359
+
360
+ if (this.reconnectAttempts < this.maxReconnectAttempts) {
361
+ this.reconnectAttempts++;
362
+ const delay = Math.min(1000 * Math.pow(2, this.reconnectAttempts - 1), 30000);
363
+ console.log(`🔄 Reconnecting in ${delay / 1000}s (attempt ${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
364
+ setTimeout(() => this.connect(), delay);
365
+ } else {
366
+ console.error('❌ Max reconnection attempts reached. Please restart the worker.');
367
+ process.exit(1);
368
+ }
369
+ }
370
+
371
+ async handleMessage(message) {
372
+ // Don't log pings — they flood the terminal every 30s with no useful info
373
+ if (message.type !== 'ping') {
374
+ console.log(`📨 Received: ${message.type}`);
375
+ }
376
+
377
+ switch (message.type) {
378
+ case 'task_assign':
379
+ // Inject API key from server if provided — ensures even workers without local key config work
380
+ if (message.anthropicApiKey) {
381
+ if (!process.env.ANTHROPIC_API_KEY) {
382
+ process.env.ANTHROPIC_API_KEY = message.anthropicApiKey;
383
+ console.log('🔑 Using API key from server task_assign');
384
+ }
385
+ if (this.cli && typeof this.cli === 'object' && 'anthropicApiKey' in this.cli) {
386
+ if (this.cli.anthropicApiKey !== message.anthropicApiKey) {
387
+ this.cli.anthropicApiKey = message.anthropicApiKey;
388
+ }
389
+ }
390
+ // Also wire key to hampagent
391
+ if (!this.hampagent.anthropicApiKey) {
392
+ this.hampagent.anthropicApiKey = message.anthropicApiKey;
393
+ }
394
+ }
395
+ await this.executeTask(message);
396
+ break;
397
+
398
+ case 'task_cancel':
399
+ // Support cancellation by taskId or agentId
400
+ console.log(`📨 CANCEL REQUEST: taskId=${message.taskId} agentId=${message.agentId}`);
401
+ if (message.taskId) {
402
+ await this.cancelTask(message.taskId);
403
+ } else if (message.agentId) {
404
+ await this.cancelTaskByAgent(message.agentId);
405
+ } else {
406
+ console.log(`âš ī¸ task_cancel received without taskId or agentId!`);
407
+ }
408
+ break;
409
+
410
+ case 'ping':
411
+ this.send({ type: 'pong' });
412
+ // Check for stuck agents - if processing but no activity for 2+ pings
413
+ this.checkForStuckAgents();
414
+ break;
415
+
416
+ case 'worker_restart':
417
+ console.log('🔄 Received worker_restart command — pulling latest code then restarting...');
418
+ this.send({ type: 'worker_restarting' });
419
+ setTimeout(() => this._updateAndRestart(), 200);
420
+ break;
421
+
422
+ case 'read_agent_files':
423
+ await this.handleReadAgentFiles(message);
424
+ break;
425
+
426
+ case 'write_agent_file':
427
+ await this.handleWriteAgentFile(message);
428
+ break;
429
+
430
+ case 'shell_exec': {
431
+ const { execId, command } = message;
432
+ console.log(`đŸ–Ĩī¸ shell_exec [${execId}]: ${command}`);
433
+ try {
434
+ const { execSync } = await import('child_process');
435
+ const output = execSync(command, { encoding: 'utf-8', timeout: 30000 });
436
+ this.send({ type: 'shell_exec_result', execId, success: true, output });
437
+ } catch (e) {
438
+ this.send({ type: 'shell_exec_result', execId, success: false, output: e.message });
439
+ }
440
+ break;
441
+ }
442
+
443
+ default:
444
+ console.log(`âš ī¸ Unknown message type: ${message.type}`);
445
+ }
446
+ }
447
+
448
+ async executeTask(taskData) {
449
+ const { agentId, taskId } = taskData;
450
+ console.log(`đŸ“Ŧ Task received: ${taskId} for agent ${agentId}`);
451
+
452
+ // Initialize queue for this agent if it doesn't exist
453
+ if (!this.agentQueues.has(agentId)) {
454
+ this.agentQueues.set(agentId, []);
455
+ this.agentProcessing.set(agentId, false);
456
+ }
457
+
458
+ // Add task to queue
459
+ const queue = this.agentQueues.get(agentId);
460
+ queue.push(taskData);
461
+
462
+ const queueLength = queue.length;
463
+ const isProcessing = this.agentProcessing.get(agentId);
464
+
465
+ if (queueLength > 1 || isProcessing) {
466
+ console.log(`đŸ“Ĩ Queued: ${taskId} (${queueLength} in queue, processing=${isProcessing})`);
467
+ }
468
+
469
+ // Start processing queue if not already processing
470
+ this.processQueue(agentId);
471
+ }
472
+
473
+ async processQueue(agentId) {
474
+ // If already processing, check if it's stale
475
+ if (this.agentProcessing.get(agentId)) {
476
+ const startTime = this.processingStartTime.get(agentId);
477
+ const elapsed = startTime ? Date.now() - startTime : Infinity;
478
+ const queueLen = this.agentQueues.get(agentId)?.length || 0;
479
+
480
+ if (elapsed > this.PROCESSING_TIMEOUT_MS) {
481
+ console.log(`âš ī¸ Agent ${agentId} stuck for ${Math.round(elapsed/1000)}s, forcing reset`);
482
+ this.agentProcessing.set(agentId, false);
483
+ this.processingStartTime.delete(agentId);
484
+ // Kill the stuck process
485
+ this.cli.cancelAgent(agentId);
486
+ } else {
487
+ // Log that we're waiting (every 30s to avoid spam)
488
+ if (elapsed > 0 && elapsed % 30000 < 1000 && queueLen > 0) {
489
+ console.log(`âŗ Agent ${agentId} busy (${Math.round(elapsed/1000)}s), ${queueLen} task(s) waiting`);
490
+ }
491
+ return; // Still processing, task will trigger next when done
492
+ }
493
+ }
494
+
495
+ const queue = this.agentQueues.get(agentId);
496
+ if (!queue || queue.length === 0) {
497
+ return; // Queue empty, nothing to do
498
+ }
499
+
500
+ // Mark as processing with timestamp
501
+ this.agentProcessing.set(agentId, true);
502
+ this.processingStartTime.set(agentId, Date.now());
503
+ console.log(`🚀 Starting task for ${agentId} (${queue.length} in queue)`);
504
+
505
+ // Get next task from queue
506
+ const taskData = queue.shift();
507
+
508
+ // Guard against undefined task data (can happen during reconnection race)
509
+ if (!taskData) {
510
+ console.warn(`âš ī¸ Queue shift returned undefined for agent ${agentId}, clearing processing flag`);
511
+ this.agentProcessing.set(agentId, false);
512
+ return;
513
+ }
514
+
515
+ // Wrap execution with a hard timeout that ALWAYS resolves
516
+ // This ensures processQueue ALWAYS completes and clears state
517
+ // Should fire AFTER the task hard timeout but BEFORE the task truly hangs forever
518
+ const QUEUE_TIMEOUT_MS = 31 * 60 * 1000; // 31 minutes (slightly longer than 30 min task timeout)
519
+
520
+ let executeTaskCompleted = false;
521
+ let queueTimeoutFired = false;
522
+ let queueTimeoutTimer = null;
523
+
524
+ try {
525
+ await Promise.race([
526
+ this.executeTaskNow(taskData).then(result => {
527
+ executeTaskCompleted = true;
528
+ // CRITICAL: Clear the queue timeout since task completed successfully
529
+ if (queueTimeoutTimer) {
530
+ clearTimeout(queueTimeoutTimer);
531
+ queueTimeoutTimer = null;
532
+ }
533
+ console.log(`[${agentId}] executeTaskNow completed normally`);
534
+ return result;
535
+ }).catch(error => {
536
+ executeTaskCompleted = true;
537
+ // CRITICAL: Clear the queue timeout since task completed (with error)
538
+ if (queueTimeoutTimer) {
539
+ clearTimeout(queueTimeoutTimer);
540
+ queueTimeoutTimer = null;
541
+ }
542
+ console.log(`[${agentId}] executeTaskNow rejected with error: ${error.message}`);
543
+ throw error;
544
+ }),
545
+ new Promise((resolve) => {
546
+ queueTimeoutTimer = setTimeout(() => {
547
+ // Don't fire if task already completed
548
+ if (executeTaskCompleted) {
549
+ console.log(`⏰ Queue timeout fired but task already completed, ignoring`);
550
+ return;
551
+ }
552
+
553
+ queueTimeoutFired = true;
554
+ console.log(`⏰ QUEUE TIMEOUT: Task ${taskData.taskId} exceeded ${QUEUE_TIMEOUT_MS/1000}s`);
555
+ console.log(`⏰ executeTaskCompleted: ${executeTaskCompleted}`);
556
+ console.log(`⏰ This indicates executeTaskNow Promise never settled - likely OpenClawCLI.runAgentTask hung`);
557
+
558
+ // Force kill the agent process
559
+ const killed = this.cli.cancelAgent(agentId);
560
+ console.log(`⏰ cancelAgent result: ${killed}`);
561
+
562
+ // Send debug report for queue timeout (different from task timeout)
563
+ const diagnostics = this.collectDiagnostics(
564
+ agentId,
565
+ taskData.taskId,
566
+ new Error(`Queue timeout - Promise hung for ${QUEUE_TIMEOUT_MS/1000}s after task should have completed`),
567
+ 'queue_timeout'
568
+ );
569
+ this.sendDebugReport(diagnostics, `QUEUE TIMEOUT: Task ${taskData.taskId} Promise never resolved. This usually means the OpenClaw process close event didn't fire. cancelAgent: ${killed}`);
570
+
571
+ resolve({ timeout: true });
572
+ }, QUEUE_TIMEOUT_MS);
573
+ })
574
+ ]);
575
+ } catch (error) {
576
+ console.error(`❌ Task execution error for ${taskData.taskId}:`, error);
577
+ } finally {
578
+ // Clear timeout if still pending
579
+ if (queueTimeoutTimer) {
580
+ clearTimeout(queueTimeoutTimer);
581
+ queueTimeoutTimer = null;
582
+ }
583
+ // Log which path we took
584
+ if (queueTimeoutFired && !executeTaskCompleted) {
585
+ console.log(`[${agentId}] âš ī¸ Queue timeout won the race - executeTaskNow never completed`);
586
+ }
587
+ // ALWAYS clear processing state - this is critical
588
+ console.log(`🧹 Clearing processing state for ${agentId}`);
589
+ this.agentProcessing.set(agentId, false);
590
+ this.processingStartTime.delete(agentId);
591
+
592
+ // Process next task if queue is not empty
593
+ if (queue.length > 0) {
594
+ if (queueTimeoutFired) {
595
+ // After a force-kill, wait 3s for the dead process to release openclaw's workspace
596
+ // locks before starting the next task — avoids "Agent failed: Unknown error" on restart
597
+ console.log(`📤 Waiting 3s for killed process cleanup before next task for ${agentId}...`);
598
+ setTimeout(() => this.processQueue(agentId), 3000);
599
+ } else {
600
+ console.log(`📤 Processing next queued task for ${agentId} (${queue.length} remaining)...`);
601
+ setImmediate(() => this.processQueue(agentId));
602
+ }
603
+ }
604
+ }
605
+ }
606
+
607
+ async executeTaskNow({ taskId, agentId, sessionId, message: userMessage, workDir, defaultProjectsPath, image, roomId, roomContext, isMaestro, conversationHistory, browserProfile, agentName, agentEmoji, runnerType }) {
608
+ const isMaestroTask = isMaestro || agentId === 'maestro';
609
+ console.log(`🤖 Executing task ${taskId} for agent ${agentId}${isMaestroTask ? ' (MAESTRO)' : ''}${browserProfile ? ` [browser: ${browserProfile}]` : ''}`);
610
+ if (sessionId) {
611
+ console.log(` Session: ${sessionId} (resuming conversation)`);
612
+ }
613
+ if (image) {
614
+ console.log(` Image: Provided`);
615
+ }
616
+ if (roomId) {
617
+ console.log(` Room: ${roomId}`);
618
+ }
619
+
620
+ // Track this running task for cancellation support
621
+ this.runningTasks.set(taskId, { agentId, cancelled: false, isMaestro: isMaestroTask });
622
+
623
+ // Hard timeout for entire task - 30 minutes max (large projects with Opus can be slow)
624
+ const TASK_TIMEOUT_MS = 30 * 60 * 1000; // 30 minutes hard timeout
625
+ console.log(`âąī¸ Task ${taskId} timeout set for ${TASK_TIMEOUT_MS/1000}s`);
626
+ const taskTimeoutTimer = setTimeout(() => {
627
+ console.log(`⏰ TIMEOUT FIRED for ${taskId}`);
628
+ const taskInfo = this.runningTasks.get(taskId);
629
+ console.log(`⏰ taskInfo: ${JSON.stringify(taskInfo)}`);
630
+ if (taskInfo && !taskInfo.cancelled) {
631
+ console.log(`⏰ Task ${taskId} hit hard timeout (${TASK_TIMEOUT_MS/1000}s), force killing`);
632
+ const killed = this.cli.cancelAgent(agentId);
633
+ console.log(`⏰ cancelAgent returned: ${killed}`);
634
+
635
+ // Collect diagnostics for debug agent BEFORE cleanup
636
+ const diagnostics = this.collectDiagnostics(
637
+ agentId,
638
+ taskId,
639
+ new Error(`Task exceeded ${TASK_TIMEOUT_MS/1000}s hard timeout`),
640
+ 'timeout'
641
+ );
642
+
643
+ // CRITICAL: Reset processing state so queue can continue
644
+ // even if executeTaskNow is still hanging
645
+ this.agentProcessing.set(agentId, false);
646
+ this.processingStartTime.delete(agentId);
647
+ this.runningTasks.delete(taskId);
648
+ console.log(`⏰ Reset processing state for ${agentId}`);
649
+ taskInfo.cancelled = true;
650
+ this.send({
651
+ type: 'task_failed',
652
+ taskId,
653
+ agentId,
654
+ roomId,
655
+ error: `Task timed out after ${TASK_TIMEOUT_MS/1000} seconds`
656
+ });
657
+
658
+ // Send debug report so debug-agent-system can investigate
659
+ this.sendDebugReport(diagnostics, `Task ${taskId} for agent ${agentId} hit hard timeout (${TASK_TIMEOUT_MS/1000}s). cancelAgent returned: ${killed}`);
660
+ }
661
+ }, TASK_TIMEOUT_MS);
662
+
663
+ // Closure set once wrapped handlers are created; callable from both try and catch paths
664
+ let _cleanup = null;
665
+
666
+ try {
667
+ // Agent workspace: always the dedicated /tmp folder for this agent
668
+ // This is where MEMORY.md, AGENTS.md, .canary, identity files live - never pollute user's projects folder
669
+ const agentWorkspaceDir = workDir || process.cwd();
670
+
671
+ // Task cwd: where the agent actually runs commands and reads/writes files
672
+ // If user configured a defaultProjectsPath in settings AND it exists locally, use that
673
+ // so the agent can see all the user's projects without needing to be told where they are
674
+ let taskCwd = agentWorkspaceDir;
675
+ if (defaultProjectsPath && defaultProjectsPath.trim()) {
676
+ const { existsSync } = await import('fs');
677
+ if (existsSync(defaultProjectsPath)) {
678
+ taskCwd = defaultProjectsPath;
679
+ console.log(`📁 Using configured projects path as task cwd: ${taskCwd}`);
680
+ } else {
681
+ console.log(`âš ī¸ Configured projects path not found locally (${defaultProjectsPath}), using default cwd: ${taskCwd}`);
682
+ }
683
+ }
684
+ // Keep actualWorkDir pointing to the agent's workspace for backward compat
685
+ const actualWorkDir = agentWorkspaceDir;
686
+
687
+ // Select runner — hampagent or openclaw
688
+ const useHampagent = runnerType === 'hampagent';
689
+ const activeRunner = useHampagent ? this.hampagent : this.cli;
690
+
691
+ if (useHampagent) {
692
+ // Pass identity so system prompt knows the agent's name/emoji
693
+ this.hampagent._agentName = agentName || null;
694
+ this.hampagent._agentEmoji = agentEmoji || null;
695
+ console.log(`đŸĻ… Using Hampagent runner for ${agentId}`);
696
+ } else {
697
+ // Guard against bloated OpenClaw sessions — trim tail, preserve recent context
698
+ this._guardOpenClawSession(agentId);
699
+ }
700
+
701
+ // Create agent if not exists (always in its dedicated workspace, not the projects folder)
702
+ if (!this.activeAgents.has(agentId)) {
703
+ console.log(`đŸ“Ļ Creating agent ${agentId} workspace in ${actualWorkDir}`);
704
+ await activeRunner.createAgent(agentId, actualWorkDir);
705
+ this.activeAgents.set(agentId, { workDir: actualWorkDir, taskCwd });
706
+ }
707
+
708
+ // Set up output streaming
709
+ const outputHandler = (data) => {
710
+ if (data.agentId === agentId) {
711
+ // Check if task was cancelled - don't send any more output
712
+ const taskInfo = this.runningTasks.get(taskId);
713
+ if (!taskInfo || taskInfo.cancelled) {
714
+ return; // Task cancelled, drop all output
715
+ }
716
+
717
+ // Record activity to prevent stuck detection from firing
718
+ this.recordAgentActivity(agentId);
719
+
720
+ // Filter out tool error stack traces from room chat
721
+ const text = data.output?.trim();
722
+ if (text && roomId) {
723
+ // Don't broadcast internal tool errors to room chat
724
+ const isToolError = text.includes('tools:') && text.includes('failed stack:') ||
725
+ text.includes('Error:') && text.includes('at file:///') ||
726
+ text.includes('at async') && text.includes('.js:');
727
+ if (isToolError) {
728
+ console.log(` [Filtered tool error from room broadcast]`);
729
+ return;
730
+ }
731
+ }
732
+
733
+ // Send maestro-specific output for maestro tasks
734
+ if (isMaestroTask) {
735
+ // Detect when maestro spawns agents
736
+ const spawnMatch = text.match(/sessions_spawn\s+--id\s+([^\s]+)\s+--message\s+"([^"]+)"/);
737
+ if (spawnMatch) {
738
+ const spawnedAgentId = spawnMatch[1];
739
+ const spawnedTask = spawnMatch[2];
740
+ console.log(`đŸŽŧ Maestro spawned agent: ${spawnedAgentId}`);
741
+
742
+ // Emit agent spawned event
743
+ this.send({
744
+ type: 'maestro_agent_spawned',
745
+ taskId,
746
+ agentId: spawnedAgentId,
747
+ task: spawnedTask
748
+ });
749
+ }
750
+
751
+ this.send({
752
+ type: 'maestro_output',
753
+ taskId,
754
+ agentId,
755
+ output: data.output
756
+ });
757
+ } else {
758
+ this.send({
759
+ type: 'task_progress',
760
+ taskId,
761
+ agentId,
762
+ roomId,
763
+ output: data.output,
764
+ isChunk: true
765
+ });
766
+ }
767
+ }
768
+ };
769
+
770
+ // Handle agent_image events — screenshot the agent wants to send to the user's chat
771
+ const imageHandler = (data) => {
772
+ if (data.agentId === agentId) {
773
+ const taskInfo = this.runningTasks.get(taskId);
774
+ if (!taskInfo || taskInfo.cancelled) return;
775
+ this.send({
776
+ type: 'task_progress',
777
+ taskId,
778
+ agentId,
779
+ roomId,
780
+ output: '',
781
+ image: data.image
782
+ });
783
+ }
784
+ };
785
+ activeRunner.on('agent_image', imageHandler);
786
+
787
+ // Set up error streaming (stderr)
788
+ const errorHandler = (data) => {
789
+ if (data.agentId === agentId) {
790
+ // Check if task was cancelled - don't send any more errors
791
+ const taskInfo = this.runningTasks.get(taskId);
792
+ if (!taskInfo || taskInfo.cancelled) {
793
+ return; // Task cancelled, drop all output
794
+ }
795
+
796
+ // Filter out common non-critical stderr noise
797
+ const text = data.error?.trim();
798
+ if (!text) return;
799
+
800
+ const isNoise = text.includes('[Canary]') ||
801
+ text.includes('Plugin registered') ||
802
+ text.includes('browser/service') ||
803
+ text.includes('Debugger listening') ||
804
+ text.includes('EADDRINUSE') ||
805
+ text.includes('[openclaw]') ||
806
+ text.includes('Unhandled promise rejection') ||
807
+ text.includes('shared_storage_worklet') ||
808
+ text.includes('playwright-core') ||
809
+ text.includes('targetInfo') ||
810
+ text.includes('[diagnostic]') ||
811
+ text.includes('[stderr]') ||
812
+ text.includes('crBrowser') ||
813
+ text.includes('crConnection');
814
+
815
+ if (!isNoise) {
816
+ this.send({
817
+ type: 'task_error',
818
+ taskId,
819
+ agentId,
820
+ roomId,
821
+ error: text
822
+ });
823
+ }
824
+ }
825
+ };
826
+
827
+ // Set up tool activity streaming (shows what tool agent is using)
828
+ const toolActivityHandler = (data) => {
829
+ if (data.agentId === agentId) {
830
+ // Record activity to prevent stuck detection from firing
831
+ this.recordAgentActivity(agentId);
832
+
833
+ // tts tool is handled natively by openclaw — do not intercept or emit anything
834
+
835
+ let toolInputPreview;
836
+ if (data.toolInput) {
837
+ try {
838
+ toolInputPreview = JSON.stringify(data.toolInput).slice(0, 2000);
839
+ } catch {
840
+ toolInputPreview = undefined;
841
+ }
842
+ }
843
+
844
+ this.send({
845
+ type: 'tool_activity',
846
+ taskId,
847
+ agentId,
848
+ roomId,
849
+ event: data.event,
850
+ tool: data.tool,
851
+ description: data.description,
852
+ input: toolInputPreview
853
+ });
854
+ }
855
+ };
856
+
857
+ activeRunner.on('agent_output', outputHandler);
858
+ activeRunner.on('agent_error', errorHandler);
859
+ activeRunner.on('tool_activity', toolActivityHandler);
860
+
861
+ // Listen for raw alive signals (any stdout, even filtered) to prevent false stuck detection
862
+ const aliveHandler = (data) => {
863
+ if (data.agentId === agentId) {
864
+ this.recordAgentActivity(agentId);
865
+ }
866
+ };
867
+ activeRunner.on('agent_alive', aliveHandler);
868
+
869
+ // Inactivity: warn at 60s, KILL at 2 minutes of silence (no stdout at all)
870
+ const INACTIVITY_WARN_MS = 60000;
871
+ const INACTIVITY_KILL_MS = 2 * 60 * 1000; // 2 minutes — kill truly hung openclaw
872
+ let lastActivityTime = Date.now();
873
+ let inactivityTimer = null;
874
+ let inactivityKillTimer = null;
875
+ let currentTool = null; // Track which tool is currently running
876
+ let promiseSettled = false; // Prevent kill timer from firing after task completes
877
+
878
+ // Tools that are expected to take a while - don't warn about these
879
+ const QUIET_TOOLS = ['Editing file', 'Writing file', 'Reading file', 'edit', 'write', 'read'];
880
+
881
+ const clearInactivityTimers = () => {
882
+ if (inactivityTimer) { clearTimeout(inactivityTimer); inactivityTimer = null; }
883
+ if (inactivityKillTimer) { clearTimeout(inactivityKillTimer); inactivityKillTimer = null; }
884
+ };
885
+
886
+ const resetInactivityTimer = () => {
887
+ lastActivityTime = Date.now();
888
+ // Also reset the ping-based stuck detector so 300s kill doesn't fire during active work
889
+ this.lastAgentActivity.set(agentId, Date.now());
890
+ this.pingsSinceActivity.set(agentId, 0);
891
+ clearInactivityTimers();
892
+
893
+ // Warn after INACTIVITY_WARN_MS (60s)
894
+ inactivityTimer = setTimeout(() => {
895
+ const taskInfo = this.runningTasks.get(taskId);
896
+ if (taskInfo && !taskInfo.cancelled) {
897
+ if (currentTool && QUIET_TOOLS.some(t => currentTool.toLowerCase().includes(t.toLowerCase()))) {
898
+ resetInactivityTimer(); // quiet tool — just reset and check again later
899
+ return;
900
+ }
901
+ const stuckTool = currentTool ? ` while running "${currentTool}"` : '';
902
+ this.send({
903
+ type: 'task_warning',
904
+ taskId,
905
+ agentId,
906
+ roomId,
907
+ warning: `No activity for ${INACTIVITY_WARN_MS/1000} seconds${stuckTool} - agent may be stuck`,
908
+ lastTool: currentTool
909
+ });
910
+ }
911
+ }, INACTIVITY_WARN_MS);
912
+
913
+ // Kill after INACTIVITY_KILL_MS (2 minutes) — openclaw is definitely hung
914
+ inactivityKillTimer = setTimeout(() => {
915
+ if (promiseSettled) return;
916
+ const taskInfo = this.runningTasks.get(taskId);
917
+ if (taskInfo && !taskInfo.cancelled) {
918
+ console.warn(`[${agentId}] âš ī¸ No output for ${INACTIVITY_KILL_MS/1000}s — openclaw hung mid-task, killing`);
919
+ promiseSettled = true;
920
+ // cancelAgent kills the process tree; OpenClawCLI's close handler will reject runAgentTask
921
+ activeRunner.cancelAgent(agentId);
922
+ }
923
+ }, INACTIVITY_KILL_MS);
924
+ };
925
+
926
+ // Track tool lifecycle
927
+ const toolLifecycleHandler = (data) => {
928
+ if (data.agentId === agentId && data.event) {
929
+ if (data.event === 'start') {
930
+ currentTool = data.description || data.tool;
931
+ } else if (data.event === 'complete') {
932
+ currentTool = null;
933
+ }
934
+ }
935
+ };
936
+ activeRunner.on('tool_activity', toolLifecycleHandler);
937
+
938
+ // Reset timer on any activity
939
+ const activityWrapper = (handler) => (data) => {
940
+ resetInactivityTimer();
941
+ handler(data);
942
+ };
943
+
944
+ // Wrap handlers to track activity
945
+ const wrappedOutputHandler = activityWrapper(outputHandler);
946
+ const wrappedToolHandler = activityWrapper(toolActivityHandler);
947
+ const wrappedAliveHandler = activityWrapper(aliveHandler);
948
+
949
+ activeRunner.off('agent_output', outputHandler);
950
+ activeRunner.off('tool_activity', toolActivityHandler);
951
+ activeRunner.off('agent_alive', aliveHandler);
952
+ activeRunner.on('agent_output', wrappedOutputHandler);
953
+ activeRunner.on('tool_activity', wrappedToolHandler);
954
+ activeRunner.on('agent_alive', wrappedAliveHandler);
955
+
956
+ // Capture cleanup as a callable closure so both try and catch paths can use it
957
+ _cleanup = () => {
958
+ promiseSettled = true; // Prevent inactivity kill timer from firing after task ends
959
+ clearInactivityTimers();
960
+ activeRunner.off('agent_output', wrappedOutputHandler);
961
+ activeRunner.off('agent_error', errorHandler);
962
+ activeRunner.off('tool_activity', wrappedToolHandler);
963
+ activeRunner.off('tool_activity', toolLifecycleHandler);
964
+ activeRunner.off('agent_alive', wrappedAliveHandler);
965
+ activeRunner.off('agent_image', imageHandler);
966
+ };
967
+
968
+ resetInactivityTimer(); // Start the timer
969
+
970
+ // Build message with room context if in a room
971
+ let finalMessage = userMessage;
972
+ if (roomContext) {
973
+ // Prepend room context to message
974
+ const contextInfo = [
975
+ `You are in a multi-agent room: ${roomContext.roomName}`,
976
+ `Participants: ${roomContext.participants.map(p => `${p.emoji} ${p.name}${p.isYou ? ' (you)' : ''}`).join(', ')}`,
977
+ roomContext.instructions,
978
+ '',
979
+ `User message: ${userMessage}`
980
+ ].join('\n');
981
+ finalMessage = contextInfo;
982
+ }
983
+
984
+ // Inject platform context into EVERY message so the agent always knows:
985
+ // 1. What platform it's running on and its URL
986
+ // 2. Where the user's projects folder is
987
+ // 3. Screenshot capabilities
988
+ const platformContext = [
989
+ `[System context:`,
990
+ `- Platform: AgentForge.ai. Dashboard: https://agentforgeai-production.up.railway.app/dashboard. CRITICAL: Always use the built-in 'browser' tool for ALL web browsing AND web searches — NEVER use the 'web_search' tool (no API keys are configured), NEVER run shell commands like 'open', 'google-chrome', 'chromium', or any OS command to launch a browser. The browser tool connects to AgentForge Browser (port 9223) automatically. To search: use browser to navigate to google.com or perplexity.ai.`,
991
+ `- Your runner: ${useHampagent ? 'Hampagent' : 'OpenClaw'}.`,
992
+ (!conversationHistory || conversationHistory.length === 0)
993
+ ? `- This is the first message. When greeting, say: "I'm [your name] — your ${useHampagent ? 'Hampagent' : 'OpenClaw'} agent running on AgentForge." Never say "autonomous AI agent". Never list capabilities in an intro.`
994
+ : `- This is a continuing conversation. Do NOT re-introduce yourself.`,
995
+ agentName
996
+ ? `- Your name is "${agentName}"${agentEmoji ? ` ${agentEmoji}` : ''}. This is your AgentForge identity. Do not ask the user who you are or what your name is — you already know.`
997
+ : null,
998
+ taskCwd && taskCwd !== agentWorkspaceDir
999
+ ? `- Working directory: "${taskCwd}" — user's projects folder. Check here first for any project by name.`
1000
+ : null,
1001
+ agentWorkspaceDir
1002
+ ? `- Screenshots: screencapture -x ${agentWorkspaceDir}/ss1.png && sips -Z 1280 ${agentWorkspaceDir}/ss1.png (MUST resize — API rejects images over 5MB). Send to chat with: echo "AGENTFORGE_IMAGE:${agentWorkspaceDir}/ss1.png". Always screenshot visual work before saying done. NEVER use "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --headless for screenshots — use screencapture only.`
1003
+ : `- Screenshots: screencapture -x /tmp/ss1.png && sips -Z 1280 /tmp/ss1.png (MUST resize — API rejects images over 5MB). Send to chat with: echo "AGENTFORGE_IMAGE:/tmp/ss1.png". Always screenshot visual work before saying done. NEVER use "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --headless for screenshots — use screencapture only.`,
1004
+ `]`
1005
+ ].filter(Boolean).join('\n');
1006
+ finalMessage = platformContext + '\n\n' + finalMessage;
1007
+
1008
+ // If conversation history was loaded from DB (e.g. session expired, worker restarted,
1009
+ // or user returning hours later), prepend it so the agent has full context.
1010
+ // Only do this for non-room tasks and when we actually have history.
1011
+ if (!roomContext && conversationHistory && conversationHistory.length > 0) {
1012
+ // When the gateway is configured, openclaw maintains session state natively via the
1013
+ // x-openclaw-session-key header — no need to inject history manually.
1014
+ // Only inject history when gateway is unavailable (subprocess fallback).
1015
+ // Hampagent manages its own session history natively — never inject DB history for it
1016
+ const gatewayActive = !!(this.cli.gatewayPort && this.cli.gatewayToken);
1017
+ const sessionExists = useHampagent || gatewayActive;
1018
+
1019
+ if (!sessionExists) {
1020
+ // Session is gone — inject DB history as context prefix so the agent remembers
1021
+ console.log(`📚 Session workspace not found for ${agentId}, injecting ${conversationHistory.length} messages from DB history`);
1022
+ // Strip tool errors and stack traces from stored assistant messages before injecting
1023
+ const stripHistoryNoise = (text) => {
1024
+ if (!text) return text;
1025
+ return text.split('\n').filter(line => {
1026
+ const t = line.trim();
1027
+ return !(
1028
+ t.startsWith('tools:') ||
1029
+ t.includes('failed stack:') ||
1030
+ /^\s*at\s+\S/.test(line) ||
1031
+ t.startsWith('at file:///') || t.startsWith('at async ') || t.startsWith('at Object.')
1032
+ );
1033
+ }).join('\n').trim();
1034
+ };
1035
+ const stripSystemContext = (text) => {
1036
+ if (!text) return text;
1037
+ // Remove stale [System context: ...] blocks injected by older worker builds
1038
+ // (e.g. old Chrome screenshot instructions). Current context is always re-injected fresh.
1039
+ return text.replace(/\[System context:[\s\S]*?\n\]/g, '').trim();
1040
+ };
1041
+ const historyText = conversationHistory
1042
+ .slice(-5) // last 5 messages — keep context small to prevent API hangs
1043
+ .map(msg => {
1044
+ const role = msg.role === 'user' ? 'User' : 'Assistant';
1045
+ const content = msg.role === 'user'
1046
+ ? stripSystemContext(msg.content)
1047
+ : stripHistoryNoise(msg.content);
1048
+ return `${role}: ${content}`;
1049
+ })
1050
+ .join('\n\n');
1051
+ // Prepend history to finalMessage (which already contains platform context + userMessage)
1052
+ finalMessage = `[Conversation history — you are resuming a prior session]\n\n${historyText}\n\n[End of history]\n\n${finalMessage}`;
1053
+ } else {
1054
+ console.log(`📚 Session workspace found for ${agentId}, openclaw will use its own session memory`);
1055
+ }
1056
+ }
1057
+
1058
+ // ── Skills: load matching skill files from agent workspace ──────────────
1059
+ // Skills are .md files in {workspace}/skills/ with YAML frontmatter.
1060
+ // Matching ones are prepended so the agent has relevant context upfront.
1061
+ if (!isMaestroTask && !roomContext) {
1062
+ const skillContext = this.loadMatchingSkills(actualWorkDir, userMessage);
1063
+ if (skillContext) {
1064
+ console.log(`[${taskId}] 📚 Injecting matching skill(s)`);
1065
+ finalMessage = `[Relevant skill from your skills library]\n${skillContext}\n\n---\n\n${finalMessage}`;
1066
+ }
1067
+
1068
+ // No verification protocol — it adds tokens and causes openclaw to hang on large contexts
1069
+ }
1070
+
1071
+ // ── Iteration loop: run up to 3 times, refining based on agent feedback ─
1072
+ const MAX_ITERATIONS = isMaestroTask || roomContext ? 1 : 3;
1073
+ let iteration = 0;
1074
+ let taskResult;
1075
+ let iterationMessage = finalMessage;
1076
+
1077
+ while (iteration < MAX_ITERATIONS) {
1078
+ iteration++;
1079
+
1080
+ if (iteration > 1) {
1081
+ console.log(`[${taskId}] 🔄 Iteration ${iteration}/${MAX_ITERATIONS}`);
1082
+ this.send({
1083
+ type: 'task_iteration',
1084
+ taskId,
1085
+ agentId,
1086
+ roomId,
1087
+ iteration,
1088
+ maxIterations: MAX_ITERATIONS
1089
+ });
1090
+ }
1091
+
1092
+ console.log(`[${taskId}] 🏃 Runner: ${useHampagent ? '⚡ HAMPAGENT' : '🔧 OPENCLAW'} — agent ${agentId} iteration ${iteration}`);
1093
+ const runAgentStart = Date.now();
1094
+ taskResult = await activeRunner.runAgentTask(
1095
+ agentId, iterationMessage, taskCwd, sessionId, iteration === 1 ? image : null, browserProfile, actualWorkDir
1096
+ );
1097
+ const runAgentDuration = Date.now() - runAgentStart;
1098
+ console.log(`[${taskId}] runAgentTask iteration ${iteration} returned after ${runAgentDuration}ms, success=${taskResult?.success}`);
1099
+
1100
+ const output = taskResult?.result?.output || '';
1101
+
1102
+ if (output.includes('✓ TASK_COMPLETE')) {
1103
+ if (iteration > 1) {
1104
+ // Save a skill stub so future similar tasks start better
1105
+ this.saveIteratedSkill(agentId, actualWorkDir, userMessage, iteration).catch(() => {});
1106
+ this.send({
1107
+ type: 'task_skill_saved',
1108
+ taskId,
1109
+ agentId,
1110
+ roomId,
1111
+ iterations: iteration
1112
+ });
1113
+ }
1114
+ break;
1115
+ }
1116
+
1117
+ // Check for iterate signal — if present and iterations remain, continue
1118
+ const iterateMatch = output.match(/â†ģ ITERATE:\s*(.+?)(?:\n|$)/);
1119
+ if (iterateMatch && iteration < MAX_ITERATIONS) {
1120
+ const feedback = iterateMatch[1].trim();
1121
+ console.log(`[${taskId}] Agent self-correcting: ${feedback}`);
1122
+ iterationMessage = `Your previous attempt had this issue: "${feedback}"\n\nPlease revise your work to fix it. When done, end with ✓ TASK_COMPLETE or â†ģ ITERATE: [remaining issue].`;
1123
+ } else {
1124
+ break; // No iterate signal or max iterations reached
1125
+ }
1126
+ }
1127
+
1128
+ // Clean up listeners and timer
1129
+ if (_cleanup) { _cleanup(); _cleanup = null; }
1130
+
1131
+ // Check if task was cancelled or already handled by timeout
1132
+ const taskInfo = this.runningTasks.get(taskId);
1133
+ if (!taskInfo || taskInfo.cancelled) {
1134
+ console.log(`âš ī¸ Task ${taskId} was already handled (cancelled/timeout), skipping`);
1135
+ return;
1136
+ }
1137
+
1138
+ // Use AgentForge-assigned identity when provided (from task_assign).
1139
+ // Fall back to querying OpenClaw only when no name was supplied.
1140
+ let identity = { identityName: agentId, identityEmoji: '🤖' };
1141
+ if (agentName) {
1142
+ identity = { identityName: agentName, identityEmoji: agentEmoji || '🤖' };
1143
+ console.log(`[${taskId}] Using AgentForge identity: ${identity.identityName}`);
1144
+ } else {
1145
+ console.log(`[${taskId}] Getting agent identity from OpenClaw...`);
1146
+ const identityStart = Date.now();
1147
+ try {
1148
+ const identityPromise = this.cli.getAgentIdentity(agentId);
1149
+ const timeoutPromise = new Promise(r => setTimeout(() => r(null), 5000)); // 5s max
1150
+ const result = await Promise.race([identityPromise, timeoutPromise]);
1151
+ if (result) {
1152
+ identity = result;
1153
+ } else {
1154
+ console.log(`[${taskId}] âš ī¸ getAgentIdentity timed out after ${Date.now() - identityStart}ms`);
1155
+ }
1156
+ } catch (e) {
1157
+ console.log(`[${taskId}] âš ī¸ getAgentIdentity failed after ${Date.now() - identityStart}ms: ${e.message}`);
1158
+ }
1159
+ console.log(`[${taskId}] Got identity in ${Date.now() - identityStart}ms: ${identity.identityName}`);
1160
+ }
1161
+
1162
+ // Send completion with identity info, final response text, and sessionId for maestro
1163
+ // Filter openclaw's "No reply from agent." placeholder — it appears when the agent only
1164
+ // used tools with no text response (e.g. TTS-only tasks). If we send it, the browser's
1165
+ // filterAgentMarkers will strip it, leaving a blank bubble. Instead, leave response
1166
+ // undefined so the server falls back to the accumulated task_progress text (e.g. "🔊 Spoke aloud: ...").
1167
+ const rawOutput = taskResult?.result?.output || '';
1168
+ console.log(`[${taskId}] 🔍 taskResult.result.output (${rawOutput.length} chars): "${rawOutput.slice(0, 200)}"`);
1169
+ let finalOutput = rawOutput.trim();
1170
+ if (/^no reply from agent\.?$/i.test(finalOutput)) {
1171
+ console.log(`[${taskId}] 🔕 Filtered "No reply from agent." from finalOutput`);
1172
+ finalOutput = '';
1173
+ }
1174
+ // If the output is just an error stack trace / tool failure message, clear it
1175
+ // so the server falls back to accumulated task_progress text instead
1176
+ if (finalOutput &&
1177
+ (finalOutput.startsWith('tools:') || finalOutput.startsWith('Error:') ||
1178
+ finalOutput.includes('failed stack:') || finalOutput.includes('at file:///') ||
1179
+ /^\s*at\s+/.test(finalOutput))) {
1180
+ console.log(`[${taskId}] 🔕 Filtered error-only output from finalOutput: "${finalOutput.slice(0,100)}"`);
1181
+ finalOutput = '';
1182
+ }
1183
+ // If the task succeeded but produced no text, emit a minimal completion token
1184
+ // so the browser never shows "No response received" for a successful task.
1185
+ if (!finalOutput && taskResult?.success) {
1186
+ finalOutput = '✓ Done.';
1187
+ console.log(`[${taskId}] â„šī¸ Task succeeded with no text output — using default completion message`);
1188
+ }
1189
+ console.log(`[${taskId}] 📤 finalOutput="${finalOutput.slice(0,100)}" response=${finalOutput ? `"${finalOutput.slice(0,80)}"` : 'undefined'}`);
1190
+ const completionMessage = {
1191
+ type: 'task_complete',
1192
+ taskId,
1193
+ agentId,
1194
+ roomId,
1195
+ identity,
1196
+ response: finalOutput || undefined
1197
+ };
1198
+
1199
+ // Include sessionId for maestro to maintain conversation
1200
+ if (isMaestroTask && sessionId) {
1201
+ completionMessage.sessionId = sessionId;
1202
+ }
1203
+
1204
+ console.log(`[${taskId}] Sending completion message...`);
1205
+ this.send(completionMessage);
1206
+
1207
+ console.log(`✅ Task ${taskId} completed (${identity.identityName})`);
1208
+
1209
+ // Clear hard timeout and clean up
1210
+ console.log(`[${taskId}] Clearing timeout and task tracking...`);
1211
+ clearTimeout(taskTimeoutTimer);
1212
+ this.runningTasks.delete(taskId);
1213
+ console.log(`[${taskId}] executeTaskNow DONE - returning normally`);
1214
+ } catch (error) {
1215
+ // Always clean up listeners — this path was previously missing cleanup,
1216
+ // causing stacked listeners across multiple task runs
1217
+ if (_cleanup) { _cleanup(); _cleanup = null; }
1218
+
1219
+ // Clear hard timeout
1220
+ clearTimeout(taskTimeoutTimer);
1221
+
1222
+ // Check if this was a cancellation or timeout
1223
+ // If taskInfo is missing, timeout already handled it
1224
+ const taskInfo = this.runningTasks.get(taskId);
1225
+ if (!taskInfo || taskInfo.cancelled) {
1226
+ console.log(`âš ī¸ Task ${taskId} was already cancelled/timed out, ignoring error`);
1227
+ return;
1228
+ }
1229
+
1230
+ console.error(`❌ Task ${taskId} failed:`, error);
1231
+
1232
+ // Collect detailed diagnostics
1233
+ const diagnostics = this.collectDiagnostics(agentId, taskId, error, 'error');
1234
+
1235
+ // Kill the agent process — try both runners (only one will be active)
1236
+ this.cli.cancelAgent(agentId);
1237
+ this.hampagent?.cancelAgent(agentId);
1238
+
1239
+ this.send({
1240
+ type: 'task_failed',
1241
+ taskId,
1242
+ agentId,
1243
+ roomId,
1244
+ error: error.message,
1245
+ stack: error.stack
1246
+ });
1247
+
1248
+ // Send debug report for investigation
1249
+ this.sendDebugReport(diagnostics, `Agent ${agentId} failed during task ${taskId}: ${error.message}`);
1250
+
1251
+ // Clean up task tracking
1252
+ this.runningTasks.delete(taskId);
1253
+ }
1254
+ }
1255
+
1256
+ async cancelTask(taskId) {
1257
+ console.log(`🛑 CANCEL RECEIVED for task ${taskId}`);
1258
+
1259
+ const taskInfo = this.runningTasks.get(taskId);
1260
+ if (!taskInfo) {
1261
+ console.log(`âš ī¸ Task ${taskId} not found in runningTasks (size: ${this.runningTasks.size})`);
1262
+ console.log(` runningTasks keys: ${[...this.runningTasks.keys()].join(', ')}`);
1263
+ // Still send cancelled message to browser to clear UI state
1264
+ this.send({
1265
+ type: 'task_cancelled',
1266
+ taskId
1267
+ });
1268
+ return;
1269
+ }
1270
+
1271
+ const { agentId } = taskInfo;
1272
+ console.log(`🛑 Found task for agent ${agentId}, cancelling...`);
1273
+
1274
+ // Mark as cancelled so completion handler knows
1275
+ taskInfo.cancelled = true;
1276
+
1277
+ // Kill the running process — try both runners (only one will be active)
1278
+ console.log(`🛑 Calling cancelAgent for ${agentId}...`);
1279
+ const killed = this.cli.cancelAgent(agentId) || this.hampagent?.cancelAgent(agentId) || false;
1280
+ console.log(`🛑 cancelAgent returned: ${killed}`);
1281
+
1282
+ // Always clear state and notify browser, regardless of kill success
1283
+ this.agentQueues.set(agentId, []);
1284
+ this.agentProcessing.set(agentId, false);
1285
+ this.processingStartTime.delete(agentId);
1286
+
1287
+ // Notify the server that the task was cancelled
1288
+ this.send({
1289
+ type: 'task_cancelled',
1290
+ taskId,
1291
+ agentId
1292
+ });
1293
+
1294
+ console.log(`✅ Task ${taskId} cancelled, kill result: ${killed}`);
1295
+
1296
+ // Clean up
1297
+ this.runningTasks.delete(taskId);
1298
+ }
1299
+
1300
+ async cancelTaskByAgent(agentId) {
1301
+ console.log(`🛑 Cancelling task for agent ${agentId}`);
1302
+
1303
+ // Find the running task for this agent
1304
+ let taskId = null;
1305
+ for (const [tid, info] of this.runningTasks.entries()) {
1306
+ if (info.agentId === agentId) {
1307
+ taskId = tid;
1308
+ break;
1309
+ }
1310
+ }
1311
+
1312
+ if (taskId) {
1313
+ await this.cancelTask(taskId);
1314
+ } else {
1315
+ // Still try to kill the process even if we don't have a taskId — try both runners
1316
+ const killed = this.cli.cancelAgent(agentId) || this.hampagent?.cancelAgent(agentId) || false;
1317
+
1318
+ // Always clear the queue and state for this agent
1319
+ this.agentQueues.set(agentId, []);
1320
+ this.agentProcessing.set(agentId, false);
1321
+ this.processingStartTime.delete(agentId);
1322
+
1323
+ if (killed) {
1324
+ console.log(`✅ Agent ${agentId} task cancelled successfully`);
1325
+ } else {
1326
+ console.log(`âš ī¸ No running task found for agent ${agentId}, clearing state anyway`);
1327
+ }
1328
+
1329
+ // ALWAYS send task_cancelled to clear browser UI state, regardless of kill success
1330
+ this.send({
1331
+ type: 'task_cancelled',
1332
+ agentId
1333
+ });
1334
+ }
1335
+ }
1336
+
1337
+ // ── Skills System ─────────────────────────────────────────────────────────
1338
+
1339
+ /**
1340
+ * Load skill files from agent workspace that match the current task.
1341
+ * Skills are .md files in {agentWorkspaceDir}/skills/ with YAML frontmatter.
1342
+ * Matching: description field must share 2+ words (>4 chars) with the task.
1343
+ */
1344
+ loadMatchingSkills(agentWorkspaceDir, task) {
1345
+ try {
1346
+ const skillsDir = path.join(agentWorkspaceDir, 'skills');
1347
+ if (!existsSync(skillsDir)) return '';
1348
+
1349
+ const files = readdirSync(skillsDir).filter(f => f.endsWith('.md'));
1350
+ if (files.length === 0) return '';
1351
+
1352
+ const taskWords = new Set(
1353
+ task.toLowerCase().split(/\W+/).filter(w => w.length > 4)
1354
+ );
1355
+ const matched = [];
1356
+
1357
+ for (const file of files) {
1358
+ try {
1359
+ const content = readFileSync(path.join(skillsDir, file), 'utf-8');
1360
+ const descMatch = content.match(/^---[\s\S]*?\ndescription:\s*["']?(.+?)["']?\s*\n[\s\S]*?---/m);
1361
+ if (descMatch) {
1362
+ const descWords = descMatch[1].toLowerCase().split(/\W+/).filter(w => w.length > 4);
1363
+ const overlap = descWords.filter(w => taskWords.has(w)).length;
1364
+ if (overlap >= 2) {
1365
+ matched.push(content.replace(/^---[\s\S]*?---\n/, '').trim());
1366
+ }
1367
+ }
1368
+ } catch { /* skip unreadable files */ }
1369
+ }
1370
+
1371
+ return matched.join('\n\n---\n\n');
1372
+ } catch {
1373
+ return '';
1374
+ }
1375
+ }
1376
+
1377
+ /**
1378
+ * Generate a short kebab-case name from a task description.
1379
+ */
1380
+ generateSkillName(task) {
1381
+ const stopwords = new Set(['that', 'this', 'with', 'from', 'have', 'been', 'your', 'will', 'would', 'could', 'should', 'please', 'make', 'create', 'build', 'system', 'context']);
1382
+ const words = task.toLowerCase()
1383
+ .replace(/[^a-z0-9\s]/g, '')
1384
+ .split(/\s+/)
1385
+ .filter(w => w.length > 3 && !stopwords.has(w))
1386
+ .slice(0, 3);
1387
+ return words.length > 0 ? words.join('-') : 'task-skill';
1388
+ }
1389
+
1390
+ /**
1391
+ * After a task that required >1 iteration succeeds, write a skill stub
1392
+ * to the agent workspace so future similar tasks start from a better position.
1393
+ */
1394
+ async saveIteratedSkill(agentId, agentWorkspaceDir, originalTask, iterations) {
1395
+ try {
1396
+ const skillsDir = path.join(agentWorkspaceDir, 'skills');
1397
+ mkdirSync(skillsDir, { recursive: true });
1398
+
1399
+ const skillName = this.generateSkillName(originalTask);
1400
+ const skillPath = path.join(skillsDir, `${skillName}.md`);
1401
+ if (existsSync(skillPath)) return; // Don't overwrite existing skill
1402
+
1403
+ // Strip injected context markers from the saved task description
1404
+ const taskPreview = originalTask
1405
+ .replace(/\[System context[\s\S]*?\]/g, '')
1406
+ .replace(/\[Relevant skill\][\s\S]*?---\n\n/g, '')
1407
+ .replace(/---\n\[VERIFICATION PROTOCOL\][\s\S]*$/m, '')
1408
+ .trim()
1409
+ .slice(0, 300);
1410
+
1411
+ const content = `---
1412
+ name: ${skillName}
1413
+ description: "Use for tasks involving: ${taskPreview.slice(0, 120).replace(/"/g, "'")}"
1414
+ ---
1415
+
1416
+ ## Task That Triggered This Skill
1417
+ ${taskPreview}
1418
+
1419
+ ## Notes
1420
+ This skill was auto-generated after the task required ${iterations} iterations.
1421
+ Review and add specific steps, pitfalls, and patterns that helped succeed.
1422
+ `;
1423
+ writeFileSync(skillPath, content, 'utf-8');
1424
+ console.log(`[${agentId}] 📚 Auto-saved skill: skills/${skillName}.md`);
1425
+ } catch (e) {
1426
+ console.log(`âš ī¸ Skill save failed: ${e.message}`);
1427
+ }
1428
+ }
1429
+
1430
+ // ── End Skills System ──────────────────────────────────────────────────────
1431
+
1432
+ // ── Designer file relay ────────────────────────────────────────────────────
1433
+
1434
+ async handleReadAgentFiles(message) {
1435
+ const { requestId, agentId } = message;
1436
+ const workDir = `/tmp/agentforge/agents/${agentId}`;
1437
+ const DEFAULT_FILES = ['IDENTITY.md', 'SOUL.md', 'AGENTS.md', 'MEMORY.md', 'AGENTFORGE.md', 'BOOTSTRAP.md', 'HEARTBEAT.md', 'TOOLS.md', 'USER.md'];
1438
+ const results = {};
1439
+
1440
+ for (const filename of DEFAULT_FILES) {
1441
+ try {
1442
+ results[filename] = readFileSync(path.join(workDir, filename), 'utf8');
1443
+ } catch {
1444
+ results[filename] = null;
1445
+ }
1446
+ }
1447
+
1448
+ // Also pick up any extra .md files in the workspace
1449
+ try {
1450
+ const entries = readdirSync(workDir).filter(e => e.endsWith('.md') && !DEFAULT_FILES.includes(e));
1451
+ for (const entry of entries) {
1452
+ try {
1453
+ results[entry] = readFileSync(path.join(workDir, entry), 'utf8');
1454
+ } catch {
1455
+ results[entry] = null;
1456
+ }
1457
+ }
1458
+ } catch { /* workspace may not exist yet */ }
1459
+
1460
+ this.send({ type: 'agent_files_result', requestId, files: results });
1461
+ }
1462
+
1463
+ async handleWriteAgentFile(message) {
1464
+ const { requestId, agentId, filename, content } = message;
1465
+ const workDir = `/tmp/agentforge/agents/${agentId}`;
1466
+ try {
1467
+ if (!existsSync(workDir)) mkdirSync(workDir, { recursive: true });
1468
+ writeFileSync(path.join(workDir, filename), content, 'utf8');
1469
+ this.send({ type: 'write_agent_file_result', requestId, success: true });
1470
+ } catch (err) {
1471
+ this.send({ type: 'write_agent_file_result', requestId, success: false, error: err.message });
1472
+ }
1473
+ }
1474
+
1475
+ send(message) {
1476
+ // Deduplicate task completions to prevent double-sends
1477
+ if (message.type === 'task_complete' && message.taskId) {
1478
+ if (this.recentCompletions.has(message.taskId)) {
1479
+ console.log(`âš ī¸ Skipping duplicate task_complete for ${message.taskId}`);
1480
+ return;
1481
+ }
1482
+ this.recentCompletions.add(message.taskId);
1483
+ // Clean up after TTL
1484
+ setTimeout(() => this.recentCompletions.delete(message.taskId), this.completionTTL);
1485
+ }
1486
+
1487
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
1488
+ this.ws.send(JSON.stringify(message));
1489
+ } else {
1490
+ // Queue important messages (completions, failures) for retry after reconnect
1491
+ const importantTypes = ['task_complete', 'task_failed', 'task_cancelled'];
1492
+ if (importantTypes.includes(message.type)) {
1493
+ if (this.pendingMessages.length < this.maxPendingMessages) {
1494
+ console.log(`đŸ“Ļ Queuing ${message.type} for ${message.taskId || message.agentId} (websocket disconnected)`);
1495
+ this.pendingMessages.push(message);
1496
+ } else {
1497
+ console.warn(`âš ī¸ Pending message queue full, dropping ${message.type}`);
1498
+ }
1499
+ }
1500
+ }
1501
+ }
1502
+
1503
+ resetStaleProcessingStates() {
1504
+ const now = Date.now();
1505
+ let resetCount = 0;
1506
+
1507
+ for (const [agentId, isProcessing] of this.agentProcessing.entries()) {
1508
+ if (isProcessing) {
1509
+ const startTime = this.processingStartTime.get(agentId);
1510
+ const elapsed = startTime ? now - startTime : Infinity;
1511
+
1512
+ // Reset if processing for too long OR if we just reconnected (previous task is dead)
1513
+ if (elapsed > this.PROCESSING_TIMEOUT_MS || !startTime) {
1514
+ console.log(`🔄 Resetting stale processing state for ${agentId} (was stuck for ${Math.round(elapsed/1000)}s)`);
1515
+ this.agentProcessing.set(agentId, false);
1516
+ this.processingStartTime.delete(agentId);
1517
+ resetCount++;
1518
+ }
1519
+ }
1520
+ }
1521
+
1522
+ if (resetCount > 0) {
1523
+ console.log(`🔄 Reset ${resetCount} stale processing states`);
1524
+ }
1525
+ }
1526
+
1527
+ processAllQueues() {
1528
+ // Kick-start processing for any queues that have pending tasks
1529
+ for (const [agentId, queue] of this.agentQueues.entries()) {
1530
+ if (queue.length > 0 && !this.agentProcessing.get(agentId)) {
1531
+ console.log(`📤 Resuming queue for ${agentId} (${queue.length} tasks waiting)`);
1532
+ this.processQueue(agentId);
1533
+ }
1534
+ }
1535
+ }
1536
+
1537
+ flushPendingMessages() {
1538
+ if (this.pendingMessages.length === 0) return;
1539
+
1540
+ console.log(`📤 Flushing ${this.pendingMessages.length} pending messages...`);
1541
+ const messages = [...this.pendingMessages];
1542
+ this.pendingMessages = [];
1543
+
1544
+ for (const message of messages) {
1545
+ if (this.ws && this.ws.readyState === WebSocket.OPEN) {
1546
+ console.log(` → Sending queued ${message.type} for ${message.taskId || message.agentId}`);
1547
+ this.ws.send(JSON.stringify(message));
1548
+ } else {
1549
+ // Still not connected, re-queue
1550
+ this.pendingMessages.push(message);
1551
+ break;
1552
+ }
1553
+ }
1554
+ }
1555
+
1556
+ // Record that an agent produced output (reset stuck detection)
1557
+ recordAgentActivity(agentId) {
1558
+ this.lastAgentActivity.set(agentId, Date.now());
1559
+ this.pingsSinceActivity.set(agentId, 0);
1560
+ }
1561
+
1562
+ // Collect detailed diagnostics for debug agent
1563
+ collectDiagnostics(agentId, taskId, error, reason) {
1564
+ const taskInfo = this.runningTasks.get(taskId);
1565
+ const lastActivity = this.lastAgentActivity.get(agentId);
1566
+ const pings = this.pingsSinceActivity.get(agentId) || 0;
1567
+ const processingTime = this.processingStartTime.get(agentId);
1568
+
1569
+ return {
1570
+ timestamp: new Date().toISOString(),
1571
+ reason, // 'error', 'stuck', 'timeout'
1572
+ agentId,
1573
+ taskId,
1574
+ error: error ? {
1575
+ message: error.message,
1576
+ stack: error.stack,
1577
+ name: error.name
1578
+ } : null,
1579
+ activity: {
1580
+ lastActivityTime: lastActivity,
1581
+ timeSinceActivity: lastActivity ? Date.now() - lastActivity : null,
1582
+ pingsSinceActivity: pings,
1583
+ processingStartTime: processingTime,
1584
+ processingDuration: processingTime ? Date.now() - processingTime : null
1585
+ },
1586
+ task: taskInfo ? {
1587
+ agentId: taskInfo.agentId,
1588
+ roomId: taskInfo.roomId,
1589
+ cancelled: taskInfo.cancelled
1590
+ } : null,
1591
+ process: {
1592
+ hasProcess: this.cli.activeAgents ? this.cli.activeAgents.has(agentId) : false,
1593
+ pid: this.cli.activeAgents ? this.cli.activeAgents.get(agentId)?.proc?.pid : null
1594
+ }
1595
+ };
1596
+ }
1597
+
1598
+ // Send debug report to server for investigation by debug agent
1599
+ sendDebugReport(diagnostics, userMessage) {
1600
+ this.send({
1601
+ type: 'debug_report',
1602
+ diagnostics,
1603
+ userMessage,
1604
+ timestamp: new Date().toISOString()
1605
+ });
1606
+ }
1607
+
1608
+ // Check for stuck agents on each ping
1609
+ checkForStuckAgents() {
1610
+ for (const [agentId, isProcessing] of this.agentProcessing.entries()) {
1611
+ if (isProcessing) {
1612
+ // First, check if the process is still alive - if so, it's probably just thinking
1613
+ const agentInfo = this.cli.activeAgents?.get(agentId);
1614
+ const pid = agentInfo?.proc?.pid;
1615
+ if (pid) {
1616
+ try {
1617
+ // process.kill(pid, 0) checks if process exists without killing it
1618
+ process.kill(pid, 0);
1619
+ // Process is alive - record activity to prevent false stuck detection
1620
+ // This handles cases where the CLI is blocking on API calls with no stdout
1621
+ this.recordAgentActivity(agentId);
1622
+ } catch (e) {
1623
+ // Process is dead - let stuck detection proceed
1624
+ console.log(`âš ī¸ Agent ${agentId} process (PID ${pid}) appears dead`);
1625
+ }
1626
+ }
1627
+
1628
+ // Increment ping counter for this agent
1629
+ const pings = (this.pingsSinceActivity.get(agentId) || 0) + 1;
1630
+ this.pingsSinceActivity.set(agentId, pings);
1631
+
1632
+ // Check if there's an active task for this agent
1633
+ let hasActiveTask = false;
1634
+ for (const [taskId, taskInfo] of this.runningTasks.entries()) {
1635
+ if (taskInfo.agentId === agentId && !taskInfo.cancelled) {
1636
+ hasActiveTask = true;
1637
+ break;
1638
+ }
1639
+ }
1640
+
1641
+ // Use very long threshold if task is active (10 pings = 300s / 5 min)
1642
+ // OpenClaw embedded agents can spend 2-3+ minutes on complex reasoning
1643
+ // Only mark as stuck if truly unresponsive (no output for 5+ minutes)
1644
+ const threshold = hasActiveTask ? 10 : this.STUCK_PING_THRESHOLD;
1645
+
1646
+ // Log warning when agent is quiet but not yet stuck (helps with debugging)
1647
+ if (pings >= this.STUCK_PING_THRESHOLD && pings < threshold) {
1648
+ console.log(`âš ī¸ Agent ${agentId} quiet for ${pings} pings (${Math.round((Date.now() - this.lastAgentActivity.get(agentId)) / 1000)}s), but task is active - waiting...`);
1649
+ }
1650
+
1651
+ if (pings >= threshold) {
1652
+ const lastActivity = this.lastAgentActivity.get(agentId);
1653
+ const elapsed = lastActivity ? Math.round((Date.now() - lastActivity) / 1000) : '?';
1654
+ const reason = hasActiveTask ? 'no output for 300s+ AND process dead' : 'no active task';
1655
+ console.log(`🚨 STUCK DETECTED: Agent ${agentId} has had ${pings} pings with no activity (${reason}, last activity: ${elapsed}s ago)`);
1656
+ console.log(`🚨 Force resetting agent ${agentId} to accept new tasks`);
1657
+
1658
+ // Find the task for diagnostics
1659
+ let stuckTaskId = null;
1660
+ for (const [taskId, taskInfo] of this.runningTasks.entries()) {
1661
+ if (taskInfo.agentId === agentId && !taskInfo.cancelled) {
1662
+ stuckTaskId = taskId;
1663
+ break;
1664
+ }
1665
+ }
1666
+
1667
+ // Collect diagnostics before cleanup
1668
+ const diagnostics = this.collectDiagnostics(
1669
+ agentId,
1670
+ stuckTaskId,
1671
+ new Error(`Agent unresponsive for ${elapsed}s`),
1672
+ 'stuck'
1673
+ );
1674
+
1675
+ // Force kill the process — try both runners
1676
+ this.cli.cancelAgent(agentId);
1677
+ this.hampagent?.cancelAgent(agentId);
1678
+
1679
+ // Clear all state for this agent
1680
+ this.agentProcessing.set(agentId, false);
1681
+ this.processingStartTime.delete(agentId);
1682
+ this.pingsSinceActivity.set(agentId, 0);
1683
+
1684
+ // Find and cancel any running task for this agent
1685
+ for (const [taskId, taskInfo] of this.runningTasks.entries()) {
1686
+ if (taskInfo.agentId === agentId && !taskInfo.cancelled) {
1687
+ taskInfo.cancelled = true;
1688
+ this.runningTasks.delete(taskId);
1689
+ this.send({
1690
+ type: 'task_failed',
1691
+ taskId,
1692
+ agentId,
1693
+ error: 'Agent became unresponsive (stuck detection triggered)'
1694
+ });
1695
+ }
1696
+ }
1697
+
1698
+ // Send debug report
1699
+ this.sendDebugReport(diagnostics, `Agent ${agentId} became unresponsive after ${elapsed}s with no activity`);
1700
+
1701
+ // Process any queued tasks
1702
+ const queue = this.agentQueues.get(agentId);
1703
+ if (queue && queue.length > 0) {
1704
+ console.log(`📤 Processing ${queue.length} queued tasks after stuck recovery`);
1705
+ setImmediate(() => this.processQueue(agentId));
1706
+ }
1707
+ }
1708
+ }
1709
+ }
1710
+ }
1711
+
1712
+ async shutdown() {
1713
+ console.log('🛑 Shutting down worker...');
1714
+ // Kill all active agent processes so they don't become orphans on restart
1715
+ if (this.cli && typeof this.cli.cancelAgent === 'function') {
1716
+ for (const agentId of this.agentProcessing.keys()) {
1717
+ if (this.agentProcessing.get(agentId)) {
1718
+ console.log(`đŸ”Ē Killing agent process: ${agentId}`);
1719
+ try { this.cli.cancelAgent(agentId); } catch (e) { /* already dead */ }
1720
+ }
1721
+ }
1722
+ }
1723
+ if (this.ws) {
1724
+ this.ws.close();
1725
+ }
1726
+ process.exit(0);
1727
+ }
1728
+
1729
+ // Find the AgentForge git repo root, regardless of whether worker is globally installed or run from source
1730
+ _findRepoRoot() {
1731
+ const home = homedir();
1732
+ const candidates = [
1733
+ path.resolve(__dirname, '../../..'), // running from source
1734
+ path.join(home, 'Desktop', 'Projects', 'AgentForge.ai'),
1735
+ path.join(home, 'Desktop', 'projects', 'AgentForge.ai'),
1736
+ path.join(home, 'Projects', 'AgentForge.ai'),
1737
+ path.join(home, 'projects', 'AgentForge.ai'),
1738
+ ];
1739
+ for (const candidate of candidates) {
1740
+ if (existsSync(path.join(candidate, '.git'))) return candidate;
1741
+ }
1742
+ return null;
1743
+ }
1744
+
1745
+ // Auto-update: git pull the repo, reinstall the global package, then exit so process manager restarts
1746
+ async _updateAndRestart() {
1747
+ const { execSync } = await import('child_process');
1748
+ const repoRoot = this._findRepoRoot();
1749
+ if (repoRoot) {
1750
+ try {
1751
+ console.log(`đŸ“Ļ git pull in ${repoRoot}...`);
1752
+ const out = execSync('git pull', { cwd: repoRoot, encoding: 'utf-8', timeout: 30000 });
1753
+ console.log(out.trim() || '(already up to date)');
1754
+ // Reinstall global package so updated source is picked up on next start
1755
+ try {
1756
+ console.log('đŸ“Ļ Reinstalling agentforge package...');
1757
+ execSync('npm install -g ./packages/worker', { cwd: repoRoot, encoding: 'utf-8', timeout: 60000, stdio: 'pipe' });
1758
+ console.log('✅ Package reinstalled');
1759
+ } catch (e) {
1760
+ console.warn('âš ī¸ npm install failed (will restart anyway):', e.message);
1761
+ }
1762
+ } catch (e) {
1763
+ console.warn('âš ī¸ git pull failed (will restart anyway):', e.message);
1764
+ }
1765
+ } else {
1766
+ console.log('â„šī¸ No git repo found — restarting with current code');
1767
+ }
1768
+ console.log('🔄 Auto-update complete — exiting so you can restart: agentforge start');
1769
+ process.exit(0);
1770
+ }
1771
+
1772
+ // Periodically check for updates and restart if new commits are available
1773
+ _startAutoUpdateCheck(intervalMs = 10 * 60 * 1000) {
1774
+ setInterval(async () => {
1775
+ const { execSync } = await import('child_process');
1776
+ const repoRoot = this._findRepoRoot();
1777
+ if (!repoRoot) return;
1778
+ try {
1779
+ execSync('git fetch --quiet', { cwd: repoRoot, timeout: 15000 });
1780
+ const status = execSync('git rev-list HEAD..origin/master --count', { cwd: repoRoot, encoding: 'utf-8', timeout: 5000 }).trim();
1781
+ if (parseInt(status) > 0) {
1782
+ console.log(`🆕 ${status} new commit(s) on origin/master — auto-updating (worker will restart)...`);
1783
+ if (this.ws?.readyState === 1) this.send({ type: 'worker_restarting' });
1784
+ setTimeout(() => this._updateAndRestart(), 300);
1785
+ }
1786
+ } catch (e) {
1787
+ // Non-fatal — network may be down, skip this check
1788
+ }
1789
+ }, intervalMs);
1790
+ }
1791
+ }