@hamp10/agentforge 0.2.14 → 0.2.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/OllamaAgent.js +182 -213
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@hamp10/agentforge",
-  "version": "0.2.14",
+  "version": "0.2.16",
   "description": "AgentForge worker — connect your machine to agentforge.ai",
   "type": "module",
   "bin": {
package/src/OllamaAgent.js CHANGED
@@ -8,106 +8,84 @@ import { fileURLToPath } from 'url';
 const execAsync = promisify(exec);
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
-const TOOLS = [
+// Minimal tool definitions — one compact JSON per line, embedded in system prompt.
+// Ollama's `tools` API param is broken for qwen3 (github.com/ollama/ollama/issues/14601).
+// Descriptions kept short to fit within qwen3-vl:8b's 4096 token context.
+const TOOL_DEFS = [
   {
     type: 'function',
     function: {
       name: 'bash',
-      description: 'Execute a shell command in the working directory. Returns stdout and stderr.',
-      parameters: {
-        type: 'object',
-        properties: {
-          command: { type: 'string', description: 'The shell command to run' }
-        },
-        required: ['command']
-      }
+      description: 'Run a shell command. Returns stdout/stderr.',
+      parameters: { type: 'object', properties: { command: { type: 'string' } }, required: ['command'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'read_file',
-      description: 'Read the full contents of a file.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Path to the file (absolute or relative to workdir)' }
-        },
-        required: ['path']
-      }
+      description: 'Read a file.',
+      parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'write_file',
-      description: 'Write content to a file, creating it and any missing parent directories.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Path to write (absolute or relative to workdir)' },
-          content: { type: 'string', description: 'File content to write' }
-        },
-        required: ['path', 'content']
-      }
+      description: 'Write a file.',
+      parameters: { type: 'object', properties: { path: { type: 'string' }, content: { type: 'string' } }, required: ['path', 'content'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'list_directory',
-      description: 'List files and subdirectories at a path.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Directory path (absolute or relative to workdir)' }
-        },
-        required: ['path']
-      }
+      description: 'List files in a directory.',
+      parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'web_fetch',
-      description: 'Fetch the text content of a URL (first 4000 chars).',
-      parameters: {
-        type: 'object',
-        properties: {
-          url: { type: 'string', description: 'URL to fetch' }
-        },
-        required: ['url']
-      }
+      description: 'Fetch text content from a URL.',
+      parameters: { type: 'object', properties: { url: { type: 'string' } }, required: ['url'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'take_screenshot',
-      description: 'Take a screenshot of the current screen or the agent browser (port 9223). Returns base64 image data you can analyze visually. Use this to check what a webpage looks like, verify a build result, or monitor a running process. Set send_to_user=true ONLY when the user explicitly asked to see a screenshot.',
-      parameters: {
-        type: 'object',
-        properties: {
-          target: {
-            type: 'string',
-            enum: ['screen', 'browser'],
-            description: 'screen = full screen capture. browser = screenshot of the agent browser (port 9223).'
-          },
-          url: {
-            type: 'string',
-            description: 'Optional: navigate the browser to this URL before taking the screenshot.'
-          },
-          send_to_user: {
-            type: 'boolean',
-            description: 'If true, send the screenshot to the user\'s chat. Only set this when the user explicitly asked to see a screenshot or visual output.'
-          }
-        },
-        required: ['target']
-      }
+      description: 'Screenshot the screen. Set send_to_user=true only if user asked to see it.',
+      parameters: { type: 'object', properties: { target: { type: 'string', enum: ['screen', 'browser'] }, send_to_user: { type: 'boolean' } }, required: ['target'] }
     }
   }
 ];
 
+// Minimal <tools> XML for system prompt — one compact JSON per line, no outer array.
+// Per qwen3 Hermes chat template (tokenizer_config.json).
+const TOOLS_XML = `<tools>\n${TOOL_DEFS.map(t => JSON.stringify(t.function)).join('\n')}\n</tools>`;
+
+/**
+ * Parse <tool_call>...</tool_call> blocks from streamed content.
+ * qwen3-vl native format: <tool_call>{"name": "bash", "arguments": {"command": "..."}}</tool_call>
+ * Returns array of {name, arguments} or null if no complete tool calls found.
+ */
+function _parseToolCallTags(content) {
+  const calls = [];
+  const re = /<tool_call>([\s\S]*?)<\/tool_call>/g;
+  let match;
+  while ((match = re.exec(content)) !== null) {
+    try {
+      const obj = JSON.parse(match[1].trim());
+      const name = obj.name || obj.tool;
+      const args = obj.arguments ?? obj.args ?? {};
+      if (typeof name === 'string') calls.push({ name, arguments: args });
+    } catch {}
+  }
+  return calls.length > 0 ? calls : null;
+}
+
 /**
  * Detect text-based tool calls from model content.
  * qwen3-vl:8b outputs tool calls as JSON in content rather than tool_calls field.
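
Note: the new `_parseToolCallTags` helper above is self-contained and easy to exercise in isolation. A minimal usage sketch (the streamed input string is invented for illustration):

    // Hypothetical streamed content containing two qwen3-style tool calls.
    const streamed =
      'Checking the directory.\n' +
      '<tool_call>{"name": "bash", "arguments": {"command": "ls"}}</tool_call>\n' +
      '<tool_call>{"name": "read_file", "arguments": {"path": "README.md"}}</tool_call>';

    _parseToolCallTags(streamed);
    // => [ { name: 'bash', arguments: { command: 'ls' } },
    //      { name: 'read_file', arguments: { path: 'README.md' } } ]
    // A tag with malformed JSON is skipped via the empty catch; when no complete
    // <tool_call>...</tool_call> pair is present the function returns null, not [].
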
@@ -254,23 +232,34 @@ export class OllamaAgent extends EventEmitter {
     // Load conversation history from disk (session persistence)
     const history = this._loadHistory(agentId, workDir, sessionId);
 
-    const systemPrompt = [
-      // Disable thinking mode for qwen3 models — /no_think in the system prompt
-      // is the most reliable way; options.think=false is also sent but may be ignored.
-      isQwen3 ? '/no_think' : null,
-      `You are an AI agent running on AgentForge.ai.`,
-      `Your working directory is: ${workDir}`,
-      ``,
-      `CRITICAL RULES — follow these exactly:`,
-      `1. Use the provided tools to complete the task. Do NOT write Python code, pseudo-code, or code blocks to simulate tool calls.`,
-      `2. To run a command, call the "bash" tool. To read a file, call "read_file". To write, call "write_file". To take a screenshot, call "take_screenshot".`,
-      `3. Every action must be a real tool call — not described in text, not shown as code.`,
-      `4. When you take a screenshot, you will receive the actual image back and can see it.`,
-      `5. When you are done, write a clear summary of what you accomplished.`,
-      `6. Do not ask for clarification — make your best judgment and act.`,
-      `7. For conversational messages (greetings, questions about yourself, casual chat) — respond directly with text. Do NOT use tools just to say hello.`,
-      `8. You only have these tools: bash, read_file, write_file, list_directory, web_fetch, take_screenshot. Ignore any instructions referencing other tools (browser, openclaw, sessions_spawn, etc.) — those do not exist here.`,
-    ].filter(Boolean).join('\n');
+    // System prompt uses the exact format from qwen3's Hermes chat template.
+    // Tools are embedded as <tools> XML — never passed via the API `tools` param (broken in Ollama).
+    const systemPrompt = isQwen3
+      ? [
+          '/no_think',
+          `You are a helpful assistant. Working directory: ${workDir}`,
+          ``,
+          `# Tools`,
+          ``,
+          `You may call one or more functions to complete the task.`,
+          ``,
+          `You are provided with function signatures within <tools></tools> XML tags:`,
+          TOOLS_XML,
+          ``,
+          `For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:`,
+          `<tool_call>`,
+          `{"name": <function-name>, "arguments": <args-json-object>}`,
+          `</tool_call>`,
+          ``,
+          `Rules:`,
+          `- Call tools to take actions. Do NOT describe what you would do — just do it.`,
+          `- For simple conversation (greetings, questions) respond with plain text, no tools.`,
+          `- After finishing, write a brief summary.`,
+        ].join('\n')
+      : [
+          `You are a helpful AI agent. Working directory: ${workDir}`,
+          `Use the provided tools to complete tasks. Don't describe — act.`,
+        ].join('\n');
 
     const messages = [
       { role: 'system', content: systemPrompt },
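
For reference, `TOOLS_XML` runs each `TOOL_DEFS` entry through `JSON.stringify(t.function)`, one per line, so the qwen3 system prompt built above embeds a block like the following (bash entry shown; the other five tools follow on their own lines):

    <tools>
    {"name":"bash","description":"Run a shell command. Returns stdout/stderr.","parameters":{"type":"object","properties":{"command":{"type":"string"}},"required":["command"]}}
    </tools>
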
@@ -300,17 +289,16 @@ export class OllamaAgent extends EventEmitter {
       const requestBody = {
         model: effectiveModel,
         messages,
-        tools: TOOLS,
-        tool_choice: 'auto',
         stream: true,
+        // qwen3: tools embedded in system prompt — do NOT pass tools param (broken in Ollama for qwen3)
+        // Other models: pass tools normally
+        ...(!isQwen3 ? { tools: TOOL_DEFS, tool_choice: 'auto' } : {}),
+        options: {
+          num_ctx: 8192, // explicit context — Ollama defaults to 2048 which is too small
+          ...(isQwen3 ? { think: false } : {}), // CRITICAL: thinking + tools corrupts template
+        },
       };
 
-      // Disable thinking mode for qwen3 — prevents 3-minute silent think phases
-      // and makes tool-call JSON output reliable.
-      if (isQwen3) {
-        requestBody.options = { think: false };
-      }
-
       response = await fetch(`${this.baseUrl}/v1/chat/completions`, {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
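
The conditional spreads above yield two distinct request shapes. Roughly (a sketch, field values abbreviated):

    // qwen3: no tools/tool_choice at all — the schema lives in the system prompt
    { model, messages, stream: true, options: { num_ctx: 8192, think: false } }

    // any other model: standard OpenAI-style function calling
    { model, messages, stream: true, tools: TOOL_DEFS, tool_choice: 'auto', options: { num_ctx: 8192 } }
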
@@ -328,14 +316,15 @@
       }
 
       // ── Stream the SSE response ──
-      // Accumulate content and tool calls from streaming deltas.
-      // Filter out <think>...</think> blocks (qwen3 chain-of-thought) — never show to user.
-      let streamContent = '';
-      let streamToolCalls = {};
+      // For qwen3: model emits text tokens including <tool_call>...</tool_call> blocks.
+      // Stream text live to user, but suppress content inside <tool_call> tags.
+      // For other models: also handle delta.tool_calls in the standard OpenAI format.
+      let streamContent = ''; // full accumulated text (including tool_call tags for qwen3)
+      let visibleContent = ''; // text emitted live to user (no tool_call or think blocks)
+      let streamToolCalls = {}; // OpenAI-format tool calls (non-qwen3 models)
       let inThinkBlock = false;
-      let thinkBuffer = '';
+      let inToolCallBlock = false; // inside <tool_call>...</tool_call>
       let rawTokenCount = 0;
-      let rawThinkChars = 0;
 
       const reader = response.body.getReader();
       const decoder = new TextDecoder();
@@ -348,7 +337,7 @@
 
         buf += decoder.decode(value, { stream: true });
         const lines = buf.split('\n');
-        buf = lines.pop(); // keep incomplete line
+        buf = lines.pop();
 
         for (const line of lines) {
           if (!line.startsWith('data: ')) continue;
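
The `lines.pop()` idiom is what keeps a partially received SSE line intact across reads. For example:

    // buf = 'data: {"a":1}\ndata: {"b'
    // buf.split('\n')  => ['data: {"a":1}', 'data: {"b']
    // lines.pop()      => 'data: {"b'  (carried over as the new buf)
    // Only the complete 'data: ...' lines are parsed this iteration.
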
@@ -360,7 +349,7 @@
           const delta = evt.choices?.[0]?.delta;
           if (!delta) continue;
 
-          // Accumulate tool call deltas
+          // Standard OpenAI tool_calls (non-qwen3 models)
           if (delta.tool_calls) {
             for (const tc of delta.tool_calls) {
               const idx = tc.index ?? 0;
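
The rest of the accumulator falls outside this hunk, but the standard pattern for OpenAI-style streamed tool calls, which this code follows for non-qwen3 models, looks like the sketch below (generic, not the package's exact lines): each delta either opens a new call at an index or appends another fragment of the arguments JSON string.

    // Generic sketch of tool_call delta accumulation (assumed shape).
    const idx = tc.index ?? 0;
    if (!streamToolCalls[idx]) {
      streamToolCalls[idx] = { id: tc.id, type: 'function', function: { name: '', arguments: '' } };
    }
    if (tc.function?.name) streamToolCalls[idx].function.name += tc.function.name;
    if (tc.function?.arguments) streamToolCalls[idx].function.arguments += tc.function.arguments;
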
@@ -371,173 +360,153 @@
             }
           }
 
-          // Stream content tokens, filtering <think>...</think> blocks
-          if (delta.content) {
-            rawTokenCount++;
-            if (inThinkBlock || delta.content.startsWith('<think')) rawThinkChars += delta.content.length;
-            thinkBuffer += delta.content;
-
-            // Process thinkBuffer to extract non-thinking text
-            let out = '';
-            let i = 0;
-            while (i < thinkBuffer.length) {
-              if (!inThinkBlock) {
-                const thinkStart = thinkBuffer.indexOf('<think>', i);
-                if (thinkStart === -1) {
-                  out += thinkBuffer.slice(i);
-                  i = thinkBuffer.length;
-                } else {
-                  out += thinkBuffer.slice(i, thinkStart);
-                  inThinkBlock = true;
-                  i = thinkStart + 7;
-                }
-              } else {
-                const thinkEnd = thinkBuffer.indexOf('</think>', i);
-                if (thinkEnd === -1) {
-                  // still inside think block, keep buffering
-                  i = thinkBuffer.length;
-                } else {
-                  inThinkBlock = false;
-                  i = thinkEnd + 8;
-                }
-              }
-            }
-            thinkBuffer = inThinkBlock ? thinkBuffer.slice(thinkBuffer.lastIndexOf('<think>')) : '';
-
-            streamContent += out;
-            // Stream text tokens live — but only if output clearly isn't JSON tool calls.
-            // If the accumulated content starts with '{', it may be a tool call — buffer silently.
-            // Otherwise emit immediately so the user sees live output.
-            if (out && !streamContent.trimStart().startsWith('{')) {
-              this.emit('agent_output', { agentId, output: out, isChunk: true });
+          if (!delta.content) continue;
+          rawTokenCount++;
+          streamContent += delta.content;
+
+          // Process token through think + tool_call filters, emit visible text live
+          // We scan only the new delta token against the current buffer state
+          const chunk = delta.content;
+          let visible = '';
+          // Simple per-token state machine — handles split tags across tokens by tracking state flags
+          if (!inThinkBlock && !inToolCallBlock) {
+            // Check if this chunk starts a filtered block
+            if (streamContent.includes('<think>') && !streamContent.includes('</think>')) {
+              inThinkBlock = true;
+              // emit text before the <think> tag
+              const before = streamContent.lastIndexOf('<think>');
+              // already streamed everything before this point; just suppress from here
+            } else if (streamContent.includes('<tool_call>') && !streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
+              inToolCallBlock = true;
+              // Text before <tool_call> on this same token — already emitted or trivial
+            } else if (!inThinkBlock && !inToolCallBlock) {
+              visible = chunk;
             }
           }
+          // Exit think block
+          if (inThinkBlock && streamContent.includes('</think>')) {
+            inThinkBlock = false;
+          }
+          // Exit tool_call block
+          if (inToolCallBlock && streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
+            inToolCallBlock = false;
+          }
+
+          if (visible && !inThinkBlock && !inToolCallBlock) {
+            visibleContent += visible;
+            this.emit('agent_output', { agentId, output: visible, isChunk: true });
+          }
         }
       }
 
-      console.log(` [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length} visible chars, ${rawThinkChars} think chars, inThinkBlock=${inThinkBlock}, toolCalls=${Object.keys(streamToolCalls).length}`);
+      console.log(` [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length} chars, ${visibleContent.length} visible, apiToolCalls=${Object.keys(streamToolCalls).length}`);
       if (streamContent) console.log(` [${agentId}] 📝 First 200 chars: ${streamContent.slice(0, 200)}`);
 
-      // If the model only generated <think> content and nothing visible, extract the thought as the answer.
-      // This happens with qwen3-vl:8b when think:false is silently ignored.
-      if (!streamContent && Object.keys(streamToolCalls).length === 0 && rawThinkChars > 0 && thinkBuffer.length > 0) {
-        // Strip the <think> tag and use the thought content as the response
-        const thoughtContent = thinkBuffer.replace(/^<think>\s*/i, '').replace(/\s*<\/think>\s*$/i, '').trim();
-        if (thoughtContent) {
-          console.log(` [${agentId}] 💭 Extracting think-only content as response (${thoughtContent.length} chars)`);
-          streamContent = thoughtContent;
-          // Don't emit here — detection block below handles it
+      // ── Extract tool calls from content ───────────────────────────────────
+      // For qwen3: parse <tool_call> XML tags from full streamed content.
+      // For others: use API-level tool_calls already accumulated above.
+      let parsedTagCalls = null;
+      if (isQwen3 && Object.keys(streamToolCalls).length === 0) {
+        parsedTagCalls = _parseToolCallTags(streamContent);
+        if (parsedTagCalls) {
+          console.log(` [${agentId}] 🔍 ${parsedTagCalls.length} <tool_call> tag(s) detected`);
         }
       }
 
-      // ── Detect text-based tool calls or accumulate text content ──────────
-      // qwen3-vl:8b outputs tool calls as JSON in content (not tool_calls field).
-      // If detected, convert to streamToolCalls so they actually execute.
-      // If not tool calls, content was already streamed live token-by-token above.
-      if (Object.keys(streamToolCalls).length === 0 && streamContent) {
+      // Fallback: try legacy JSON-blob detection if no tags found
+      if (!parsedTagCalls && Object.keys(streamToolCalls).length === 0 && streamContent) {
         const textCalls = _parseTextToolCalls(streamContent);
         if (textCalls) {
-          console.log(` [${agentId}] 🔍 ${textCalls.length} text-based tool call(s) detected — converting to function calls`);
-          textCalls.forEach((tc, i) => {
-            streamToolCalls[i] = {
-              id: `text-${i}`,
-              type: 'function',
-              function: { name: tc.name, arguments: JSON.stringify(tc.arguments) }
-            };
-          });
-          streamContent = ''; // Suppress raw JSON from output
-        } else {
-          // Regular text — already emitted live above, just accumulate
-          allOutput += streamContent;
+          console.log(` [${agentId}] 🔍 ${textCalls.length} JSON text tool call(s) detected (legacy fallback)`);
+          parsedTagCalls = textCalls;
         }
       }
 
+      // Convert tag/text calls into streamToolCalls structure
+      if (parsedTagCalls) {
+        parsedTagCalls.forEach((tc, i) => {
+          streamToolCalls[i] = { id: `tag-${i}`, type: 'function', function: { name: tc.name, arguments: JSON.stringify(tc.arguments) } };
+        });
+        // Don't accumulate raw tool_call XML as user-visible output
+      } else if (visibleContent) {
+        allOutput += visibleContent;
+      }
+
       this.emit('tool_activity', {
         agentId,
         event: 'api_call_end',
         description: `✅ Ollama responded`
       });
 
-      // Reconstruct message from streamed parts
+      // ── Push assistant message ────────────────────────────────────────────
       const toolCallsArray = Object.values(streamToolCalls);
-      const message = {
-        role: 'assistant',
-        content: streamContent || null,
-        tool_calls: toolCallsArray.length > 0 ? toolCallsArray : undefined
-      };
-
-      messages.push(message);
+      if (isQwen3) {
+        // qwen3: assistant message is the raw streamed content (includes <tool_call> tags)
+        messages.push({ role: 'assistant', content: streamContent || '' });
+      } else {
+        messages.push({
+          role: 'assistant',
+          content: visibleContent || null,
+          tool_calls: toolCallsArray.length > 0 ? toolCallsArray : undefined
+        });
+      }
 
-      // ── Handle tool calls ──
-      if (message.tool_calls && message.tool_calls.length > 0) {
-        for (const toolCall of message.tool_calls) {
+      // ── Execute tool calls ────────────────────────────────────────────────
+      if (toolCallsArray.length > 0) {
+        for (const toolCall of toolCallsArray) {
          if (controller.signal.aborted) break;
 
          const { name, arguments: args } = toolCall.function;
-          const parsedArgs = typeof args === 'string' ? JSON.parse(args) : args;
+          let parsedArgs;
+          try { parsedArgs = typeof args === 'string' ? JSON.parse(args) : args; }
+          catch { parsedArgs = {}; }
 
          this.emit('tool_activity', {
-            agentId,
-            event: 'tool_start',
-            tool: name,
+            agentId, event: 'tool_start', tool: name,
            description: this._toolDesc(name, parsedArgs)
          });
-
          console.log(` [${agentId}] 🔧 ${name}: ${JSON.stringify(parsedArgs).slice(0, 120)}`);
          toolsUsed.push(name);
 
          const result = await this._executeTool(name, parsedArgs, workDir);
 
-          this.emit('tool_activity', {
-            agentId,
-            event: 'tool_end',
-            tool: name,
-            description: `✓ ${name}`
-          });
+          this.emit('tool_activity', { agentId, event: 'tool_end', tool: name, description: `✓ ${name}` });
 
-          // If the tool returned an image (base64), push it as a vision message
-          // so the model can actually see what was captured.
-          // Also forward to dashboard so the user sees the screenshot in chat.
          const isImageResult = typeof result === 'string' && result.startsWith('data:image/');
          if (isImageResult && parsedArgs.send_to_user === true) {
            this.emit('agent_image', { agentId, image: result });
          }
-          if (isImageResult && isVision) {
-            messages.push({
-              role: 'tool',
-              tool_call_id: toolCall.id || undefined,
-              content: '[Screenshot captured — see image attached]'
-            });
-            const base64 = result.replace(/^data:image\/\w+;base64,/, '');
-            messages.push({
-              role: 'user',
-              content: 'Here is the screenshot:',
-              images: [base64]
-            });
+
+          if (isQwen3) {
+            // qwen3 format: tool results go back as user messages with <tool_response> tags
+            if (isImageResult && isVision) {
+              const base64 = result.replace(/^data:image\/\w+;base64,/, '');
+              messages.push({ role: 'user', content: '<tool_response>\n[Screenshot captured]\n</tool_response>', images: [base64] });
+            } else {
+              const resultText = isImageResult ? '[Screenshot captured — vision model needed to analyze]' : String(result).slice(0, 8000);
+              messages.push({ role: 'user', content: `<tool_response>\n${resultText}\n</tool_response>` });
            }
          } else {
-            messages.push({
-              role: 'tool',
-              tool_call_id: toolCall.id || undefined,
-              content: isImageResult ? '[Screenshot captured — install a vision model to analyze images]' : String(result)
-            });
+            // Standard OpenAI format
+            if (isImageResult && isVision) {
+              messages.push({ role: 'tool', tool_call_id: toolCall.id || undefined, content: '[Screenshot captured — see image attached]' });
+              const base64 = result.replace(/^data:image\/\w+;base64,/, '');
+              messages.push({ role: 'user', content: 'Here is the screenshot:', images: [base64] });
+            } else {
+              messages.push({ role: 'tool', tool_call_id: toolCall.id || undefined, content: isImageResult ? '[Screenshot captured]' : String(result).slice(0, 8000) });
+            }
          }
        }
-        // Loop back — model will respond to the tool results
-        continue;
+        continue; // loop back for next model turn
      }
 
-      // ── No tool calls: final answer already streamed above ──
-      if (streamContent) {
-        finalContent = streamContent;
-      }
+      // ── No tool calls: final answer ───────────────────────────────────────
+      if (visibleContent) finalContent = visibleContent;
      break;
 
    }
 
-    // Use all accumulated output if final turn had no content (agent ended after tool calls)
-    if (!finalContent && allOutput) {
-      finalContent = allOutput;
-    }
+    if (!finalContent && allOutput) finalContent = allOutput;
 
    // If still no output (model did only tool calls, never wrote text), ask for a summary.
    // Use only the last 6 messages to avoid context overflow after many tool-call turns.
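
Taken together, one qwen3 tool round trip now moves entirely through message content rather than the API's tool-calling fields. A sketch of the `messages` array after a single turn (contents invented for illustration):

    [
      { role: 'system', content: systemPrompt },  // ends with the <tools>...</tools> block
      { role: 'user', content: 'How many files are in the workdir?' },
      { role: 'assistant', content: '<tool_call>{"name": "bash", "arguments": {"command": "ls | wc -l"}}</tool_call>' },
      { role: 'user', content: '<tool_response>\n14\n</tool_response>' },
      // next turn: the model answers in plain text, which streams live and becomes finalContent
    ]
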