@hamp10/agentforge 0.2.13 → 0.2.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/OllamaAgent.js +205 -157
package/package.json
CHANGED
package/src/OllamaAgent.js
CHANGED
|
@@ -8,7 +8,9 @@ import { fileURLToPath } from 'url';
|
|
|
8
8
|
const execAsync = promisify(exec);
|
|
9
9
|
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
// Tool definitions — used both for embedded system prompt (qwen3 format)
|
|
12
|
+
// and kept as structured objects for _toolDesc lookups.
|
|
13
|
+
const TOOL_DEFS = [
|
|
12
14
|
{
|
|
13
15
|
type: 'function',
|
|
14
16
|
function: {
|
|
@@ -108,28 +110,92 @@ const TOOLS = [
|
|
|
108
110
|
}
|
|
109
111
|
];
|
|
110
112
|
|
|
113
|
+
// Build the <tools> XML block to embed in the system prompt.
|
|
114
|
+
// Ollama's `tools` API parameter is broken for qwen3 models (malformed JSON in the prompt).
|
|
115
|
+
// The reliable fix is to embed tool definitions directly in the system prompt as XML.
|
|
116
|
+
// One JSON-serialised tool definition per line, wrapped in <tools>…</tools>.
const TOOLS_XML = ['<tools>', TOOL_DEFS.map((def) => JSON.stringify(def)).join('\n'), '</tools>'].join('\n');
|
|
117
|
+
|
|
118
|
+
/**
 * Parse <tool_call>...</tool_call> blocks from streamed content.
 * qwen3-vl native format: <tool_call>{"name": "bash", "arguments": {"command": "..."}}</tool_call>
 *
 * Both key schemas are accepted: {name, arguments} and {tool, args}.
 * When the arguments value is a JSON-encoded string that decodes to an object
 * (OpenAI style), the decoded object is returned so callers always receive
 * structured arguments; a string that does not decode to an object is passed
 * through unchanged. Missing arguments default to {}.
 * Blocks whose payload is not valid JSON, is not an object, or has no string
 * tool name are skipped (best-effort parsing of model output).
 *
 * @param {string} content - Accumulated model output to scan.
 * @returns {Array<{name: string, arguments: *}>|null} Calls in order of
 *   appearance, or null if no complete, parseable tool call was found.
 */
function _parseToolCallTags(content) {
  if (typeof content !== 'string' || content === '') return null;

  const calls = [];
  for (const match of content.matchAll(/<tool_call>([\s\S]*?)<\/tool_call>/g)) {
    let payload;
    try {
      payload = JSON.parse(match[1].trim());
    } catch {
      continue; // Malformed JSON inside the tag — skip this block, keep scanning.
    }
    if (payload === null || typeof payload !== 'object') continue;

    const name = payload.name || payload.tool;
    if (typeof name !== 'string') continue;

    let args = payload.arguments ?? payload.args ?? {};
    if (typeof args === 'string') {
      // Some models emit arguments as a JSON-encoded string; decode it so the
      // tool executor receives an object rather than a string.
      try {
        const decoded = JSON.parse(args);
        if (decoded !== null && typeof decoded === 'object') args = decoded;
      } catch {
        // Not JSON — keep the raw string as received.
      }
    }
    calls.push({ name, arguments: args });
  }
  return calls.length > 0 ? calls : null;
}
|
|
137
|
+
|
|
111
138
|
/**
 * Detect text-based tool calls from model content.
 * qwen3-vl:8b outputs tool calls as JSON in content rather than tool_calls field.
 * Supports two schemas:
 *   - {name, arguments}  (OpenAI-style; a JSON-encoded string `arguments` is decoded)
 *   - {tool, args}       (qwen3 native style)
 * Supports a single object, a JSON array of objects, and several top-level
 * objects separated only by whitespace (compact one-per-line or pretty-printed).
 *
 * @param {string} content - Full model output for one turn.
 * @returns {Array<{name: string, arguments: *}>|null} The calls if content is
 *   ONLY tool calls (at least one), else null.
 */
function _parseTextToolCalls(content) {
  if (!content || typeof content !== 'string') return null;
  const trimmed = content.trim();
  if (!trimmed.startsWith('{') && !trimmed.startsWith('[')) return null;

  // Normalise a single parsed value into {name, arguments}; null if it is not a tool call.
  const normalise = (obj) => {
    if (obj === null || typeof obj !== 'object') return null;
    if (typeof obj.name === 'string' && obj.arguments !== undefined) {
      if (typeof obj.arguments !== 'string') {
        return { name: obj.name, arguments: obj.arguments };
      }
      try {
        return { name: obj.name, arguments: JSON.parse(obj.arguments) };
      } catch {
        return null; // String arguments that are not valid JSON — not a usable call.
      }
    }
    if (typeof obj.tool === 'string' && obj.args !== undefined) {
      return { name: obj.tool, arguments: obj.args };
    }
    return null;
  };

  // Index just past the `}` that closes the object opening at `start`, or -1
  // if it never closes. Braces inside JSON string literals are ignored so that
  // values such as shell commands containing `{` or `}` do not skew the depth.
  const findObjectEnd = (text, start) => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let k = start; k < text.length; k++) {
      const ch = text[k];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === '\\') escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth++;
      } else if (ch === '}') {
        depth--;
        if (depth === 0) return k + 1;
      }
    }
    return -1;
  };

  // Try parsing the whole content as a single JSON object/array.
  let whole;
  let wholeParsed = false;
  try {
    whole = JSON.parse(trimmed);
    wholeParsed = true;
  } catch {
    // Not one JSON document — fall through to the multi-object scan below.
  }
  if (wholeParsed) {
    if (Array.isArray(whole)) {
      const calls = whole.map(normalise);
      // An empty array is not a tool call; returning [] would be truthy for callers.
      return calls.length > 0 && calls.every(Boolean) ? calls : null;
    }
    const call = normalise(whole);
    return call ? [call] : null;
  }

  // Extract multiple top-level JSON objects (separated by newlines/whitespace).
  const calls = [];
  let i = 0;
  while (i < trimmed.length) {
    // Skip whitespace/newlines between objects.
    while (i < trimmed.length && /\s/.test(trimmed[i])) i++;
    if (i >= trimmed.length) break;
    if (trimmed[i] !== '{') return null; // Non-JSON between objects — bail.

    const end = findObjectEnd(trimmed, i);
    if (end === -1) return null; // Unterminated object.

    let obj;
    try {
      obj = JSON.parse(trimmed.slice(i, end));
    } catch {
      return null;
    }
    const call = normalise(obj);
    if (!call) return null;
    calls.push(call);
    i = end;
  }
  return calls.length > 0 ? calls : null;
}
|
|
@@ -215,22 +281,23 @@ export class OllamaAgent extends EventEmitter {
|
|
|
215
281
|
// Load conversation history from disk (session persistence)
|
|
216
282
|
const history = this._loadHistory(agentId, workDir, sessionId);
|
|
217
283
|
|
|
284
|
+
// For qwen3 models: embed tool definitions in the system prompt.
|
|
285
|
+
// Ollama's `tools` API param is broken for qwen3 (malformed JSON sent to model).
|
|
286
|
+
// Embedding as XML matches the model's native Hermes-style chat template.
|
|
287
|
+
const toolsBlock = isQwen3 ? `\n\n${TOOLS_XML}\n\nFor each tool call, output ONLY a <tool_call> block with no surrounding text:\n<tool_call>\n{"name": "<tool_name>", "arguments": {<args>}}\n</tool_call>` : '';
|
|
288
|
+
|
|
218
289
|
const systemPrompt = [
|
|
219
|
-
// Disable thinking mode for qwen3 models — /no_think in the system prompt
|
|
220
|
-
// is the most reliable way; options.think=false is also sent but may be ignored.
|
|
221
290
|
isQwen3 ? '/no_think' : null,
|
|
222
291
|
`You are an AI agent running on AgentForge.ai.`,
|
|
223
292
|
`Your working directory is: ${workDir}`,
|
|
224
293
|
``,
|
|
225
|
-
`CRITICAL RULES
|
|
226
|
-
`1. Use
|
|
227
|
-
`2.
|
|
228
|
-
`3.
|
|
229
|
-
`4.
|
|
230
|
-
`5.
|
|
231
|
-
|
|
232
|
-
`7. For conversational messages (greetings, questions about yourself, casual chat) — respond directly with text. Do NOT use tools just to say hello.`,
|
|
233
|
-
`8. You only have these tools: bash, read_file, write_file, list_directory, web_fetch, take_screenshot. Ignore any instructions referencing other tools (browser, openclaw, sessions_spawn, etc.) — those do not exist here.`,
|
|
294
|
+
`CRITICAL RULES:`,
|
|
295
|
+
`1. Use tools to act. Do NOT describe steps or write code blocks — call the actual tool.`,
|
|
296
|
+
`2. bash = run shell commands. write_file = write files. read_file = read files. take_screenshot = screenshot.`,
|
|
297
|
+
`3. For conversational messages (greetings, casual chat) — respond with plain text. No tools needed.`,
|
|
298
|
+
`4. Do not ask for clarification — make your best judgment and act immediately.`,
|
|
299
|
+
`5. After completing work, write a brief summary of what you did.`,
|
|
300
|
+
toolsBlock,
|
|
234
301
|
].filter(Boolean).join('\n');
|
|
235
302
|
|
|
236
303
|
const messages = [
|
|
@@ -254,24 +321,20 @@ export class OllamaAgent extends EventEmitter {
|
|
|
254
321
|
for (let turn = 0; turn < MAX_TURNS; turn++) {
|
|
255
322
|
if (controller.signal.aborted) break;
|
|
256
323
|
|
|
257
|
-
this.emit('tool_activity', { agentId, event: '
|
|
324
|
+
this.emit('tool_activity', { agentId, event: 'tool_start', tool: 'model', description: `Thinking…` });
|
|
258
325
|
|
|
259
326
|
let response;
|
|
260
327
|
try {
|
|
261
328
|
const requestBody = {
|
|
262
329
|
model: effectiveModel,
|
|
263
330
|
messages,
|
|
264
|
-
tools: TOOLS,
|
|
265
|
-
tool_choice: 'auto',
|
|
266
331
|
stream: true,
|
|
332
|
+
// qwen3: tools embedded in system prompt — do NOT pass tools param (broken in Ollama)
|
|
333
|
+
// Other models: pass tools normally via API
|
|
334
|
+
...(!isQwen3 ? { tools: TOOL_DEFS, tool_choice: 'auto' } : {}),
|
|
335
|
+
...(isQwen3 ? { options: { think: false } } : {}),
|
|
267
336
|
};
|
|
268
337
|
|
|
269
|
-
// Disable thinking mode for qwen3 — prevents 3-minute silent think phases
|
|
270
|
-
// and makes tool-call JSON output reliable.
|
|
271
|
-
if (isQwen3) {
|
|
272
|
-
requestBody.options = { think: false };
|
|
273
|
-
}
|
|
274
|
-
|
|
275
338
|
response = await fetch(`${this.baseUrl}/v1/chat/completions`, {
|
|
276
339
|
method: 'POST',
|
|
277
340
|
headers: { 'Content-Type': 'application/json' },
|
|
@@ -289,14 +352,15 @@ export class OllamaAgent extends EventEmitter {
|
|
|
289
352
|
}
|
|
290
353
|
|
|
291
354
|
// ── Stream the SSE response ──
|
|
292
|
-
//
|
|
293
|
-
//
|
|
294
|
-
|
|
295
|
-
let
|
|
355
|
+
// For qwen3: model emits text tokens including <tool_call>...</tool_call> blocks.
|
|
356
|
+
// Stream text live to user, but suppress content inside <tool_call> tags.
|
|
357
|
+
// For other models: also handle delta.tool_calls in the standard OpenAI format.
|
|
358
|
+
let streamContent = ''; // full accumulated text (including tool_call tags for qwen3)
|
|
359
|
+
let visibleContent = ''; // text emitted live to user (no tool_call or think blocks)
|
|
360
|
+
let streamToolCalls = {}; // OpenAI-format tool calls (non-qwen3 models)
|
|
296
361
|
let inThinkBlock = false;
|
|
297
|
-
let
|
|
362
|
+
let inToolCallBlock = false; // inside <tool_call>...</tool_call>
|
|
298
363
|
let rawTokenCount = 0;
|
|
299
|
-
let rawThinkChars = 0;
|
|
300
364
|
|
|
301
365
|
const reader = response.body.getReader();
|
|
302
366
|
const decoder = new TextDecoder();
|
|
@@ -309,7 +373,7 @@ export class OllamaAgent extends EventEmitter {
|
|
|
309
373
|
|
|
310
374
|
buf += decoder.decode(value, { stream: true });
|
|
311
375
|
const lines = buf.split('\n');
|
|
312
|
-
buf = lines.pop();
|
|
376
|
+
buf = lines.pop();
|
|
313
377
|
|
|
314
378
|
for (const line of lines) {
|
|
315
379
|
if (!line.startsWith('data: ')) continue;
|
|
@@ -321,7 +385,7 @@ export class OllamaAgent extends EventEmitter {
|
|
|
321
385
|
const delta = evt.choices?.[0]?.delta;
|
|
322
386
|
if (!delta) continue;
|
|
323
387
|
|
|
324
|
-
//
|
|
388
|
+
// Standard OpenAI tool_calls (non-qwen3 models)
|
|
325
389
|
if (delta.tool_calls) {
|
|
326
390
|
for (const tc of delta.tool_calls) {
|
|
327
391
|
const idx = tc.index ?? 0;
|
|
@@ -332,169 +396,153 @@ export class OllamaAgent extends EventEmitter {
|
|
|
332
396
|
}
|
|
333
397
|
}
|
|
334
398
|
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
const thinkEnd = thinkBuffer.indexOf('</think>', i);
|
|
357
|
-
if (thinkEnd === -1) {
|
|
358
|
-
// still inside think block, keep buffering
|
|
359
|
-
i = thinkBuffer.length;
|
|
360
|
-
} else {
|
|
361
|
-
inThinkBlock = false;
|
|
362
|
-
i = thinkEnd + 8;
|
|
363
|
-
}
|
|
364
|
-
}
|
|
399
|
+
if (!delta.content) continue;
|
|
400
|
+
rawTokenCount++;
|
|
401
|
+
streamContent += delta.content;
|
|
402
|
+
|
|
403
|
+
// Process token through think + tool_call filters, emit visible text live
|
|
404
|
+
// We scan only the new delta token against the current buffer state
|
|
405
|
+
const chunk = delta.content;
|
|
406
|
+
let visible = '';
|
|
407
|
+
// Simple per-token state machine — handles split tags across tokens by tracking state flags
|
|
408
|
+
if (!inThinkBlock && !inToolCallBlock) {
|
|
409
|
+
// Check if this chunk starts a filtered block
|
|
410
|
+
if (streamContent.includes('<think>') && !streamContent.includes('</think>')) {
|
|
411
|
+
inThinkBlock = true;
|
|
412
|
+
// emit text before the <think> tag
|
|
413
|
+
const before = streamContent.lastIndexOf('<think>');
|
|
414
|
+
// already streamed everything before this point; just suppress from here
|
|
415
|
+
} else if (streamContent.includes('<tool_call>') && !streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
|
|
416
|
+
inToolCallBlock = true;
|
|
417
|
+
// Text before <tool_call> on this same token — already emitted or trivial
|
|
418
|
+
} else if (!inThinkBlock && !inToolCallBlock) {
|
|
419
|
+
visible = chunk;
|
|
365
420
|
}
|
|
366
|
-
|
|
421
|
+
}
|
|
422
|
+
// Exit think block
|
|
423
|
+
if (inThinkBlock && streamContent.includes('</think>')) {
|
|
424
|
+
inThinkBlock = false;
|
|
425
|
+
}
|
|
426
|
+
// Exit tool_call block
|
|
427
|
+
if (inToolCallBlock && streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
|
|
428
|
+
inToolCallBlock = false;
|
|
429
|
+
}
|
|
367
430
|
|
|
368
|
-
|
|
369
|
-
|
|
431
|
+
if (visible && !inThinkBlock && !inToolCallBlock) {
|
|
432
|
+
visibleContent += visible;
|
|
433
|
+
this.emit('agent_output', { agentId, output: visible, isChunk: true });
|
|
370
434
|
}
|
|
371
435
|
}
|
|
372
436
|
}
|
|
373
437
|
|
|
374
|
-
console.log(` [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length}
|
|
438
|
+
console.log(` [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length} chars, ${visibleContent.length} visible, apiToolCalls=${Object.keys(streamToolCalls).length}`);
|
|
375
439
|
if (streamContent) console.log(` [${agentId}] 📝 First 200 chars: ${streamContent.slice(0, 200)}`);
|
|
376
440
|
|
|
377
|
-
//
|
|
378
|
-
//
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
// Don't emit here — detection block below handles it
|
|
441
|
+
// ── Extract tool calls from content ───────────────────────────────────
|
|
442
|
+
// For qwen3: parse <tool_call> XML tags from full streamed content.
|
|
443
|
+
// For others: use API-level tool_calls already accumulated above.
|
|
444
|
+
let parsedTagCalls = null;
|
|
445
|
+
if (isQwen3 && Object.keys(streamToolCalls).length === 0) {
|
|
446
|
+
parsedTagCalls = _parseToolCallTags(streamContent);
|
|
447
|
+
if (parsedTagCalls) {
|
|
448
|
+
console.log(` [${agentId}] 🔍 ${parsedTagCalls.length} <tool_call> tag(s) detected`);
|
|
386
449
|
}
|
|
387
450
|
}
|
|
388
451
|
|
|
389
|
-
//
|
|
390
|
-
|
|
391
|
-
// If detected, convert to streamToolCalls and suppress the raw JSON output.
|
|
392
|
-
// Otherwise, emit the text content to the dashboard.
|
|
393
|
-
if (Object.keys(streamToolCalls).length === 0 && streamContent) {
|
|
452
|
+
// Fallback: try legacy JSON-blob detection if no tags found
|
|
453
|
+
if (!parsedTagCalls && Object.keys(streamToolCalls).length === 0 && streamContent) {
|
|
394
454
|
const textCalls = _parseTextToolCalls(streamContent);
|
|
395
455
|
if (textCalls) {
|
|
396
|
-
console.log(` [${agentId}] 🔍 ${textCalls.length} text
|
|
397
|
-
|
|
398
|
-
streamToolCalls[i] = {
|
|
399
|
-
id: `text-${i}`,
|
|
400
|
-
type: 'function',
|
|
401
|
-
function: { name: tc.name, arguments: JSON.stringify(tc.arguments) }
|
|
402
|
-
};
|
|
403
|
-
});
|
|
404
|
-
streamContent = ''; // Don't display raw JSON to user
|
|
405
|
-
} else {
|
|
406
|
-
// Regular text response — emit to dashboard
|
|
407
|
-
allOutput += streamContent;
|
|
408
|
-
if (streamContent.trim()) this.emit('agent_output', { agentId, output: streamContent });
|
|
456
|
+
console.log(` [${agentId}] 🔍 ${textCalls.length} JSON text tool call(s) detected (legacy fallback)`);
|
|
457
|
+
parsedTagCalls = textCalls;
|
|
409
458
|
}
|
|
410
459
|
}
|
|
411
460
|
|
|
461
|
+
// Convert tag/text calls into streamToolCalls structure
|
|
462
|
+
if (parsedTagCalls) {
|
|
463
|
+
parsedTagCalls.forEach((tc, i) => {
|
|
464
|
+
streamToolCalls[i] = { id: `tag-${i}`, type: 'function', function: { name: tc.name, arguments: JSON.stringify(tc.arguments) } };
|
|
465
|
+
});
|
|
466
|
+
// Don't accumulate raw tool_call XML as user-visible output
|
|
467
|
+
} else if (visibleContent) {
|
|
468
|
+
allOutput += visibleContent;
|
|
469
|
+
}
|
|
470
|
+
|
|
412
471
|
this.emit('tool_activity', {
|
|
413
472
|
agentId,
|
|
414
473
|
event: 'api_call_end',
|
|
415
474
|
description: `✅ Ollama responded`
|
|
416
475
|
});
|
|
417
476
|
|
|
418
|
-
//
|
|
477
|
+
// ── Push assistant message ────────────────────────────────────────────
|
|
419
478
|
const toolCallsArray = Object.values(streamToolCalls);
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
content: streamContent ||
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
479
|
+
if (isQwen3) {
|
|
480
|
+
// qwen3: assistant message is the raw streamed content (includes <tool_call> tags)
|
|
481
|
+
messages.push({ role: 'assistant', content: streamContent || '' });
|
|
482
|
+
} else {
|
|
483
|
+
messages.push({
|
|
484
|
+
role: 'assistant',
|
|
485
|
+
content: visibleContent || null,
|
|
486
|
+
tool_calls: toolCallsArray.length > 0 ? toolCallsArray : undefined
|
|
487
|
+
});
|
|
488
|
+
}
|
|
427
489
|
|
|
428
|
-
// ──
|
|
429
|
-
if (
|
|
430
|
-
for (const toolCall of
|
|
490
|
+
// ── Execute tool calls ────────────────────────────────────────────────
|
|
491
|
+
if (toolCallsArray.length > 0) {
|
|
492
|
+
for (const toolCall of toolCallsArray) {
|
|
431
493
|
if (controller.signal.aborted) break;
|
|
432
494
|
|
|
433
495
|
const { name, arguments: args } = toolCall.function;
|
|
434
|
-
|
|
496
|
+
let parsedArgs;
|
|
497
|
+
try { parsedArgs = typeof args === 'string' ? JSON.parse(args) : args; }
|
|
498
|
+
catch { parsedArgs = {}; }
|
|
435
499
|
|
|
436
500
|
this.emit('tool_activity', {
|
|
437
|
-
agentId,
|
|
438
|
-
event: 'tool_start',
|
|
439
|
-
tool: name,
|
|
501
|
+
agentId, event: 'tool_start', tool: name,
|
|
440
502
|
description: this._toolDesc(name, parsedArgs)
|
|
441
503
|
});
|
|
442
|
-
|
|
443
504
|
console.log(` [${agentId}] 🔧 ${name}: ${JSON.stringify(parsedArgs).slice(0, 120)}`);
|
|
444
505
|
toolsUsed.push(name);
|
|
445
506
|
|
|
446
507
|
const result = await this._executeTool(name, parsedArgs, workDir);
|
|
447
508
|
|
|
448
|
-
this.emit('tool_activity', {
|
|
449
|
-
agentId,
|
|
450
|
-
event: 'tool_end',
|
|
451
|
-
tool: name,
|
|
452
|
-
description: `✓ ${name}`
|
|
453
|
-
});
|
|
509
|
+
this.emit('tool_activity', { agentId, event: 'tool_end', tool: name, description: `✓ ${name}` });
|
|
454
510
|
|
|
455
|
-
// If the tool returned an image (base64), push it as a vision message
|
|
456
|
-
// so the model can actually see what was captured.
|
|
457
|
-
// Also forward to dashboard so the user sees the screenshot in chat.
|
|
458
511
|
const isImageResult = typeof result === 'string' && result.startsWith('data:image/');
|
|
459
512
|
if (isImageResult && parsedArgs.send_to_user === true) {
|
|
460
513
|
this.emit('agent_image', { agentId, image: result });
|
|
461
514
|
}
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
role: 'user',
|
|
471
|
-
|
|
472
|
-
images: [base64]
|
|
473
|
-
});
|
|
515
|
+
|
|
516
|
+
if (isQwen3) {
|
|
517
|
+
// qwen3 format: tool results go back as user messages with <tool_response> tags
|
|
518
|
+
if (isImageResult && isVision) {
|
|
519
|
+
const base64 = result.replace(/^data:image\/\w+;base64,/, '');
|
|
520
|
+
messages.push({ role: 'user', content: '<tool_response>\n[Screenshot captured]\n</tool_response>', images: [base64] });
|
|
521
|
+
} else {
|
|
522
|
+
const resultText = isImageResult ? '[Screenshot captured — vision model needed to analyze]' : String(result).slice(0, 8000);
|
|
523
|
+
messages.push({ role: 'user', content: `<tool_response>\n${resultText}\n</tool_response>` });
|
|
524
|
+
}
|
|
474
525
|
} else {
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
tool_call_id: toolCall.id || undefined,
|
|
478
|
-
|
|
479
|
-
|
|
526
|
+
// Standard OpenAI format
|
|
527
|
+
if (isImageResult && isVision) {
|
|
528
|
+
messages.push({ role: 'tool', tool_call_id: toolCall.id || undefined, content: '[Screenshot captured — see image attached]' });
|
|
529
|
+
const base64 = result.replace(/^data:image\/\w+;base64,/, '');
|
|
530
|
+
messages.push({ role: 'user', content: 'Here is the screenshot:', images: [base64] });
|
|
531
|
+
} else {
|
|
532
|
+
messages.push({ role: 'tool', tool_call_id: toolCall.id || undefined, content: isImageResult ? '[Screenshot captured]' : String(result).slice(0, 8000) });
|
|
533
|
+
}
|
|
480
534
|
}
|
|
481
535
|
}
|
|
482
|
-
//
|
|
483
|
-
continue;
|
|
536
|
+
continue; // loop back for next model turn
|
|
484
537
|
}
|
|
485
538
|
|
|
486
|
-
// ── No tool calls: final answer
|
|
487
|
-
if (
|
|
488
|
-
finalContent = streamContent;
|
|
489
|
-
}
|
|
539
|
+
// ── No tool calls: final answer ───────────────────────────────────────
|
|
540
|
+
if (visibleContent) finalContent = visibleContent;
|
|
490
541
|
break;
|
|
491
542
|
|
|
492
543
|
}
|
|
493
544
|
|
|
494
|
-
|
|
495
|
-
if (!finalContent && allOutput) {
|
|
496
|
-
finalContent = allOutput;
|
|
497
|
-
}
|
|
545
|
+
if (!finalContent && allOutput) finalContent = allOutput;
|
|
498
546
|
|
|
499
547
|
// If still no output (model did only tool calls, never wrote text), ask for a summary.
|
|
500
548
|
// Use only the last 6 messages to avoid context overflow after many tool-call turns.
|