@hamp10/agentforge 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import { spawn } from 'child_process';
2
- import { mkdirSync, writeFileSync, readFileSync, existsSync, readdirSync } from 'fs';
2
+ import { mkdirSync, writeFileSync, readFileSync, existsSync, readdirSync, unlinkSync } from 'fs';
3
3
  import { EventEmitter } from 'events';
4
4
  import { homedir } from 'os';
5
5
  import path from 'path';
@@ -40,6 +40,8 @@ export class OpenClawCLI extends EventEmitter {
40
40
  // OpenClaw Gateway streaming config — populated by worker.js on init
41
41
  this.gatewayPort = null;
42
42
  this.gatewayToken = null;
43
+ // AbortControllers for in-flight gateway requests — keyed by agentId
44
+ this.gatewayAbortControllers = new Map();
43
45
  }
44
46
 
45
47
  /**
@@ -52,6 +54,11 @@ export class OpenClawCLI extends EventEmitter {
52
54
  const url = `http://127.0.0.1:${this.gatewayPort}/v1/chat/completions`;
53
55
  const sessionKey = `agent:${agentId}:main`;
54
56
  let fullText = '';
57
+ const seenToolCallIds = new Set(); // also used in catch for timeout detection
58
+
59
+ // Register an AbortController so cancelAgent() can abort this fetch
60
+ const abortController = new AbortController();
61
+ this.gatewayAbortControllers.set(agentId, abortController);
55
62
 
56
63
  // Friendly names for tool calls shown as live messages to the user
57
64
  const toolLabels = {
@@ -67,6 +74,7 @@ export class OpenClawCLI extends EventEmitter {
67
74
  try {
68
75
  const res = await fetch(url, {
69
76
  method: 'POST',
77
+ signal: abortController.signal,
70
78
  headers: {
71
79
  'Authorization': `Bearer ${this.gatewayToken}`,
72
80
  'Content-Type': 'application/json',
@@ -79,7 +87,7 @@ export class OpenClawCLI extends EventEmitter {
79
87
  messages: [{ role: 'user', content: task }],
80
88
  stream: true,
81
89
  }),
82
- signal: AbortSignal.timeout(600_000), // 10 min max
90
+ // No timeout — let agents run as long as needed
83
91
  });
84
92
  if (!res.ok) {
85
93
  console.warn(`[${agentId}] ⚠️ Streaming HTTP ${res.status} — falling back to subprocess`);
@@ -89,7 +97,7 @@ export class OpenClawCLI extends EventEmitter {
89
97
  // Parse SSE stream — emit text tokens AND tool call activity
90
98
  const decoder = new TextDecoder();
91
99
  let buffer = '';
92
- const seenToolCallIds = new Set(); // avoid duplicate tool_activity for same call_id
100
+ // seenToolCallIds declared above try block
93
101
  const pendingToolCalls = new Map(); // index -> { id, name, args }
94
102
 
95
103
  for await (const rawChunk of res.body) {
@@ -159,8 +167,23 @@ export class OpenClawCLI extends EventEmitter {
159
167
  }
160
168
  }
161
169
  // Return an object so callers can distinguish "success with no text" from "request failed"
162
- return { text: fullText, succeeded: true };
170
+ this.gatewayAbortControllers.delete(agentId);
171
+ return { text: fullText, succeeded: true, hadToolCalls: seenToolCallIds.size > 0 };
163
172
  } catch (err) {
173
+ this.gatewayAbortControllers.delete(agentId);
174
+ // Explicit cancel via cancelAgent() — treat as clean stop
175
+ if (err.name === 'AbortError') {
176
+ console.log(`[${agentId}] 🛑 Gateway request aborted by cancelAgent`);
177
+ return { text: fullText, succeeded: false, cancelled: true };
178
+ }
179
+ const isTimeout = err.name === 'TimeoutError' || err.message?.includes('timeout') || err.message?.includes('aborted');
180
+ if (isTimeout) {
181
+ console.warn(`[${agentId}] ⚠️ Streaming timed out after ${Math.round(fullText.length / 4)} tokens — gateway still processing`);
182
+ if (fullText.length > 0 || seenToolCallIds.size > 0) {
183
+ return { text: fullText, succeeded: true, hadToolCalls: seenToolCallIds.size > 0 };
184
+ }
185
+ return { text: '', succeeded: false, timedOut: true };
186
+ }
164
187
  console.warn(`[${agentId}] ⚠️ Streaming HTTP error: ${err.message} — falling back to subprocess`);
165
188
  return null; // null = request failed, subprocess fallback needed
166
189
  }
@@ -747,7 +770,7 @@ export class OpenClawCLI extends EventEmitter {
747
770
  // This ensures MEMORY.md and memory/ dir exist for memory persistence
748
771
  // Use bundled templates (packaged with worker) as primary source,
749
772
  // fall back to /tmp if somehow missing
750
- const bundledTemplateDir = path.join(path.dirname(new URL(import.meta.url).pathname), '../../templates/agent');
773
+ const bundledTemplateDir = path.join(path.dirname(new URL(import.meta.url).pathname), '../templates/agent');
751
774
  const templateDir = existsSync(bundledTemplateDir) ? bundledTemplateDir : '/tmp/agentforge/templates/agent';
752
775
  console.log(`📁 Using templates from: ${templateDir}`);
753
776
  try {
@@ -800,7 +823,11 @@ export class OpenClawCLI extends EventEmitter {
800
823
  * Run an agent task
801
824
  * Images are saved to workspace and referenced in message for vision model analysis
802
825
  */
803
- async runAgentTask(agentId, task, workDir, sessionId = null, image = null, browserProfile = null, imageWorkDir = null) {
826
+ async runAgentTask(agentId, task, workDir, sessionId = null, image = null, browserProfile = null, imageWorkDir = null, agentModel = null) {
827
+ // Apply per-agent model override before running (writes to openclaw.json + signals gateway)
828
+ if (agentModel) {
829
+ await this.setAgentModel(agentId, agentModel);
830
+ }
804
831
  // ── Gateway path disabled — subprocess shows live tool activity ──────────
805
832
  // Gateway path: SSE token streaming — tokens arrive live as the model generates.
806
833
  // Dashboard buffers tokens into sentences before showing each as a complete bubble.
@@ -808,20 +835,41 @@ export class OpenClawCLI extends EventEmitter {
808
835
  console.log(`\n🤖 Running agent (streaming): ${agentId}`);
809
836
  console.log(` Task: ${task.slice(0, 120)}${task.length > 120 ? '…' : ''}`);
810
837
  try {
811
- const streamResult = await this._runAgentTaskStreaming(agentId, task, sessionId);
838
+ // Retry up to 3 times on empty responses (API overloaded/rate limit)
839
+ let streamResult = null;
840
+ for (let attempt = 1; attempt <= 3; attempt++) {
841
+ streamResult = await this._runAgentTaskStreaming(agentId, task, sessionId);
842
+ if (streamResult === null) break; // connection failure — fall back to subprocess
843
+ if (streamResult.timedOut) break; // timeout — don't retry
844
+ if (streamResult.text || streamResult.hadToolCalls) break; // got real output — done
845
+ // Empty response — API overloaded, wait and retry
846
+ console.warn(`[${agentId}] ⚠️ Gateway returned empty response (attempt ${attempt}/3) — retrying in ${attempt * 5}s`);
847
+ if (attempt < 3) await new Promise(r => setTimeout(r, attempt * 5000));
848
+ }
812
849
  if (streamResult !== null) {
850
+ // Streaming timed out with zero output — gateway still processing, can't fall back
851
+ if (streamResult.timedOut) {
852
+ const errorText = "Agent timed out: the model took too long to respond. The gateway is still processing — please wait a moment and try again.";
853
+ let identity = { identityName: agentId, identityEmoji: '🤖' };
854
+ try { identity = await this.getAgentIdentity(agentId); } catch { /* ignore */ }
855
+ this.emit('agent_completed', { agentId, duration: 0, result: { output: errorText }, identity });
856
+ return { success: true, agentId, duration: 0, result: { output: errorText }, identity };
857
+ }
813
858
  // Gateway request succeeded (streamResult.succeeded === true).
814
- // Use the text response if any; if the agent only did tool work with no
815
- // text output, return empty string — do NOT fall back to subprocess (which
816
- // would re-run the same task a second time and corrupt the session state).
817
859
  const responseText = streamResult.text || '';
818
- if (!responseText) {
819
- console.log(`[${agentId}] Gateway task completed with no text output (tool-only task)`);
860
+ if (!responseText && !streamResult.hadToolCalls) {
861
+ // Still empty after retries — API is down, fall back to subprocess
862
+ console.warn(`[${agentId}] ⚠️ Gateway returned no content after 3 attempts — falling back to subprocess`);
863
+ // fall through to subprocess path below
864
+ } else {
865
+ if (!responseText) {
866
+ console.log(`[${agentId}] ✅ Gateway task completed with no text output (tool-only task)`);
867
+ }
868
+ let identity = { identityName: agentId, identityEmoji: '🤖' };
869
+ try { identity = await this.getAgentIdentity(agentId); } catch { /* ignore */ }
870
+ this.emit('agent_completed', { agentId, duration: 0, result: { output: responseText }, identity });
871
+ return { success: true, agentId, duration: 0, result: { output: responseText }, identity };
820
872
  }
821
- let identity = { identityName: agentId, identityEmoji: '🤖' };
822
- try { identity = await this.getAgentIdentity(agentId); } catch { /* ignore */ }
823
- this.emit('agent_completed', { agentId, duration: 0, result: { output: responseText }, identity });
824
- return { success: true, agentId, duration: 0, result: { output: responseText }, identity };
825
873
  }
826
874
  console.warn(`[${agentId}] ⚠️ Streaming request failed — falling back to subprocess`);
827
875
  } catch (err) {
@@ -956,18 +1004,40 @@ export class OpenClawCLI extends EventEmitter {
956
1004
  const existingAgent = this.activeAgents.get(agentId);
957
1005
  if (existingAgent && existingAgent.proc && !existingAgent.proc.killed) {
958
1006
  console.log(`[${agentId}] 🔪 Killing lingering process (pid ${existingAgent.proc.pid}) before spawning new one`);
1007
+ const lingerPgid = existingAgent.pgid || existingAgent.proc.pid;
1008
+ try { process.kill(-lingerPgid, 'SIGKILL'); } catch (e) { /* already dead */ }
959
1009
  try { treeKill(existingAgent.proc.pid, 'SIGKILL'); } catch (e) { /* already dead */ }
960
1010
  this.activeAgents.delete(agentId);
961
1011
  // Wait for process to fully exit and release file locks before spawning
962
1012
  await new Promise(r => setTimeout(r, 800));
963
1013
  }
964
1014
 
1015
+ // Nuke any stale lock files before spawning — the gateway (a persistent process)
1016
+ // can hold session locks indefinitely after a killed task and never release them,
1017
+ // causing the next task to fail with "session file locked".
1018
+ const sessionDir = path.join(homedir(), '.openclaw', 'agents', agentId, 'sessions');
1019
+ if (existsSync(sessionDir)) {
1020
+ try {
1021
+ for (const f of readdirSync(sessionDir)) {
1022
+ if (f.endsWith('.lock')) {
1023
+ unlinkSync(path.join(sessionDir, f));
1024
+ console.log(`[${agentId}] 🔓 Cleared stale lock before spawn: ${f}`);
1025
+ }
1026
+ }
1027
+ } catch (e) { /* ignore */ }
1028
+ }
1029
+
965
1030
  // Change to working directory and run agent
966
1031
  // Use process.execPath (node) directly to avoid shell metacharacter issues
967
1032
  // with user message content (quotes, apostrophes, etc.)
1033
+ // detached: true makes the child a process group leader (pgid = proc.pid).
1034
+ // Background subprocesses spawned by openclaw's exec tool inherit this pgid
1035
+ // (bash disables job control when non-interactive, so & doesn't create new groups).
1036
+ // This lets cancelAgent kill the entire group — including orphaned bg processes.
968
1037
  const proc = spawn(process.execPath, [this.bin, ...args], {
969
1038
  cwd: workDir,
970
- env: agentEnv
1039
+ env: agentEnv,
1040
+ detached: true
971
1041
  });
972
1042
 
973
1043
  let output = '';
@@ -980,7 +1050,7 @@ export class OpenClawCLI extends EventEmitter {
980
1050
  const firstOutputTimer = setTimeout(() => {
981
1051
  if (!firstOutputSeen && !promiseSettled) {
982
1052
  console.warn(`[${agentId}] ⚠️ No output in 90s — openclaw hung, killing`);
983
- try { proc.kill('SIGKILL'); } catch (e) { /* already dead */ }
1053
+ treeKill(proc.pid, 'SIGKILL'); // kill entire process tree — sub-agents included
984
1054
  if (!promiseSettled) {
985
1055
  promiseSettled = true;
986
1056
  this.activeAgents.delete(agentId);
@@ -1014,8 +1084,7 @@ export class OpenClawCLI extends EventEmitter {
1014
1084
  if (runCompleted || promiseSettled) return; // already handled
1015
1085
  console.log(`[${agentId}] ⚠️ Process still running 30s after agent end — force killing (compaction hung?)`);
1016
1086
  runCompleted = true;
1017
- try { proc.kill('SIGTERM'); } catch (e) { /* already dead */ }
1018
- setTimeout(() => { try { proc.kill('SIGKILL'); } catch (e) {} }, 1000);
1087
+ treeKill(proc.pid, 'SIGKILL'); // kill entire process tree — sub-agents included
1019
1088
  if (!promiseSettled) {
1020
1089
  promiseSettled = true;
1021
1090
  const duration = Date.now() - startTime;
@@ -1050,16 +1119,7 @@ export class OpenClawCLI extends EventEmitter {
1050
1119
  completionTimer = setTimeout(async () => {
1051
1120
  if (proc && !proc.killed) {
1052
1121
  console.log(`[${agentId}] ⚠️ Process didn't exit after run completed, force killing`);
1053
- try {
1054
- proc.kill('SIGTERM');
1055
- setTimeout(() => {
1056
- if (!proc.killed) {
1057
- proc.kill('SIGKILL');
1058
- }
1059
- }, 1000);
1060
- } catch (e) {
1061
- // Process might already be dead
1062
- }
1122
+ treeKill(proc.pid, 'SIGKILL'); // kill entire process tree — sub-agents included
1063
1123
  }
1064
1124
  // Task completed successfully — resolve now instead of waiting for close event.
1065
1125
  // The close event can hang indefinitely if openclaw's child subprocesses keep
@@ -1199,6 +1259,8 @@ export class OpenClawCLI extends EventEmitter {
1199
1259
  proc.on('close', async (code) => {
1200
1260
  const duration = Date.now() - startTime;
1201
1261
  clearTimeout(firstOutputTimer);
1262
+ // Sweep up any surviving child processes (e.g. sessions_spawn sub-agents that outlived the parent)
1263
+ try { treeKill(proc.pid, 'SIGKILL'); } catch (e) { /* already dead */ }
1202
1264
 
1203
1265
  // Clear the completion timer if it's still running
1204
1266
  if (completionTimer) {
@@ -1278,12 +1340,13 @@ export class OpenClawCLI extends EventEmitter {
1278
1340
  }
1279
1341
  });
1280
1342
 
1281
- // Track active agent
1343
+ // Track active agent (pgid = proc.pid because detached:true makes it a group leader)
1282
1344
  this.activeAgents.set(agentId, {
1283
1345
  proc,
1284
1346
  startTime,
1285
1347
  task,
1286
- workDir
1348
+ workDir,
1349
+ pgid: proc.pid
1287
1350
  });
1288
1351
  });
1289
1352
  }
@@ -1320,6 +1383,73 @@ export class OpenClawCLI extends EventEmitter {
1320
1383
  return results;
1321
1384
  }
1322
1385
 
1386
+ /**
1387
+ * Set per-agent model override in ~/.openclaw/openclaw.json
1388
+ * Adds the model to the catalog allowlist and sets the agent's primary model.
1389
+ * Called before running a task so the spawned openclaw process (or gateway) picks it up.
1390
+ */
1391
+ async setAgentModel(agentId, modelString) {
1392
+ if (!modelString) return;
1393
+ const cfgPath = path.join(homedir(), '.openclaw', 'openclaw.json');
1394
+ try {
1395
+ let cfg = {};
1396
+ if (existsSync(cfgPath)) {
1397
+ cfg = JSON.parse(readFileSync(cfgPath, 'utf-8'));
1398
+ }
1399
+ // Ensure structure
1400
+ if (!cfg.agents) cfg.agents = {};
1401
+ if (!cfg.agents.defaults) cfg.agents.defaults = {};
1402
+ if (!cfg.agents.defaults.models) cfg.agents.defaults.models = {};
1403
+ if (!cfg.agents.list) cfg.agents.list = [];
1404
+
1405
+ // Add to catalog/allowlist so OpenClaw permits this model
1406
+ if (!cfg.agents.defaults.models[modelString]) {
1407
+ cfg.agents.defaults.models[modelString] = {};
1408
+ }
1409
+
1410
+ // Find or create per-agent entry
1411
+ let agentEntry = cfg.agents.list.find(a => a.id === agentId);
1412
+ if (!agentEntry) {
1413
+ agentEntry = { id: agentId };
1414
+ cfg.agents.list.push(agentEntry);
1415
+ }
1416
+
1417
+ // Set per-agent model override
1418
+ agentEntry.model = { primary: modelString };
1419
+
1420
+ writeFileSync(cfgPath, JSON.stringify(cfg, null, 2));
1421
+ console.log(`[${agentId}] 🔧 Model override → ${modelString}`);
1422
+
1423
+ // Signal gateway to reload config so streaming path picks up the new model
1424
+ if (this.gatewayPort && this.gatewayToken) {
1425
+ try {
1426
+ // Try PATCH /api/config (OpenClaw gateway REST endpoint)
1427
+ const res = await fetch(`http://127.0.0.1:${this.gatewayPort}/api/config`, {
1428
+ method: 'PATCH',
1429
+ headers: {
1430
+ 'Authorization': `Bearer ${this.gatewayToken}`,
1431
+ 'Content-Type': 'application/json',
1432
+ },
1433
+ body: JSON.stringify({
1434
+ agents: {
1435
+ defaults: { models: cfg.agents.defaults.models },
1436
+ list: cfg.agents.list,
1437
+ },
1438
+ }),
1439
+ signal: AbortSignal.timeout(2000),
1440
+ });
1441
+ if (res.ok) {
1442
+ console.log(`[${agentId}] ✅ Gateway config patched`);
1443
+ }
1444
+ } catch {
1445
+ // Not fatal — subprocess path reads config fresh each time
1446
+ }
1447
+ }
1448
+ } catch (err) {
1449
+ console.warn(`[${agentId}] ⚠️ setAgentModel failed: ${err.message}`);
1450
+ }
1451
+ }
1452
+
1323
1453
  /**
1324
1454
  * List all agents (with timeout to prevent hanging)
1325
1455
  */
@@ -1490,27 +1620,61 @@ export class OpenClawCLI extends EventEmitter {
1490
1620
  * Cancel a running agent task by killing the process tree immediately
1491
1621
  */
1492
1622
  cancelAgent(agentId) {
1623
+ // Kill gateway fetch if running in streaming mode
1624
+ const abortCtrl = this.gatewayAbortControllers.get(agentId);
1625
+ if (abortCtrl) {
1626
+ console.log(`🛑 Aborting gateway stream for agent ${agentId}`);
1627
+ abortCtrl.abort();
1628
+ this.gatewayAbortControllers.delete(agentId);
1629
+ }
1630
+
1493
1631
  const agentInfo = this.activeAgents.get(agentId);
1494
1632
  if (!agentInfo || !agentInfo.proc) {
1633
+ if (abortCtrl) return true; // gateway abort counts as success
1495
1634
  console.log(`⚠️ No running process found for agent ${agentId}`);
1496
1635
  return false;
1497
1636
  }
1498
1637
 
1499
- const { proc } = agentInfo;
1638
+ const { proc, pgid } = agentInfo;
1500
1639
  const pid = proc.pid;
1501
-
1502
- console.log(`🛑 Killing process tree for agent ${agentId} (PID: ${pid})`);
1503
-
1640
+
1641
+ console.log(`🛑 Killing process group for agent ${agentId} (PID: ${pid}, PGID: ${pgid || pid})`);
1642
+
1504
1643
  // Clean up tracking immediately
1505
1644
  this.activeAgents.delete(agentId);
1506
-
1507
- // Use tree-kill to kill the entire process tree with SIGKILL (immediate, no grace period)
1645
+
1646
+ // Kill the entire process GROUP first (catches background/orphaned subprocesses that
1647
+ // escaped the process tree, e.g. `python3 -m http.server &` spawned by openclaw exec).
1648
+ // Since we spawn with detached:true, pgid == proc.pid and all non-interactive bash
1649
+ // background processes inherit this pgid.
1650
+ const groupId = pgid || pid;
1651
+ try {
1652
+ process.kill(-groupId, 'SIGKILL');
1653
+ console.log(`✅ Process group -${groupId} killed`);
1654
+ } catch (e) {
1655
+ // Group may already be dead or process detached before we set pgid
1656
+ console.log(`⚠️ process group kill (-${groupId}): ${e.message}`);
1657
+ }
1658
+
1659
+ // Also use tree-kill as belt-and-suspenders for any sub-processes that changed pgid
1508
1660
  treeKill(pid, 'SIGKILL', (err) => {
1509
1661
  if (err) {
1510
1662
  console.log(`⚠️ tree-kill error (process may already be dead): ${err.message}`);
1511
1663
  } else {
1512
1664
  console.log(`✅ Process tree ${pid} killed successfully`);
1513
1665
  }
1666
+ // Clean up any stale lock files left by the killed process
1667
+ const sessionDir = path.join(homedir(), '.openclaw', 'agents', agentId, 'sessions');
1668
+ if (existsSync(sessionDir)) {
1669
+ try {
1670
+ for (const f of readdirSync(sessionDir)) {
1671
+ if (f.endsWith('.lock')) {
1672
+ unlinkSync(path.join(sessionDir, f));
1673
+ console.log(`🔓 Removed stale lock: ${f}`);
1674
+ }
1675
+ }
1676
+ } catch (e) { /* ignore */ }
1677
+ }
1514
1678
  });
1515
1679
 
1516
1680
  this.emit('agent_cancelled', { agentId });