@hamp10/agentforge 0.2.14 → 0.2.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/OllamaAgent.js +182 -213
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@hamp10/agentforge",
-  "version": "0.2.14",
+  "version": "0.2.16",
   "description": "AgentForge worker — connect your machine to agentforge.ai",
   "type": "module",
   "bin": {
package/src/OllamaAgent.js CHANGED
@@ -8,106 +8,84 @@ import { fileURLToPath } from 'url';
 const execAsync = promisify(exec);
 const __dirname = path.dirname(fileURLToPath(import.meta.url));
 
-const TOOLS = [
+// Minimal tool definitions — one compact JSON per line, embedded in system prompt.
+// Ollama's `tools` API param is broken for qwen3 (github.com/ollama/ollama/issues/14601).
+// Descriptions kept short to fit within qwen3-vl:8b's 4096 token context.
+const TOOL_DEFS = [
   {
     type: 'function',
     function: {
       name: 'bash',
-      description: 'Execute a shell command in the working directory. Returns stdout and stderr.',
-      parameters: {
-        type: 'object',
-        properties: {
-          command: { type: 'string', description: 'The shell command to run' }
-        },
-        required: ['command']
-      }
+      description: 'Run a shell command. Returns stdout/stderr.',
+      parameters: { type: 'object', properties: { command: { type: 'string' } }, required: ['command'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'read_file',
-      description: 'Read the full contents of a file.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Path to the file (absolute or relative to workdir)' }
-        },
-        required: ['path']
-      }
+      description: 'Read a file.',
+      parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'write_file',
-      description: 'Write content to a file, creating it and any missing parent directories.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Path to write (absolute or relative to workdir)' },
-          content: { type: 'string', description: 'File content to write' }
-        },
-        required: ['path', 'content']
-      }
+      description: 'Write a file.',
+      parameters: { type: 'object', properties: { path: { type: 'string' }, content: { type: 'string' } }, required: ['path', 'content'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'list_directory',
-      description: 'List files and subdirectories at a path.',
-      parameters: {
-        type: 'object',
-        properties: {
-          path: { type: 'string', description: 'Directory path (absolute or relative to workdir)' }
-        },
-        required: ['path']
-      }
+      description: 'List files in a directory.',
+      parameters: { type: 'object', properties: { path: { type: 'string' } }, required: ['path'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'web_fetch',
-      description: 'Fetch the text content of a URL (first 4000 chars).',
-      parameters: {
-        type: 'object',
-        properties: {
-          url: { type: 'string', description: 'URL to fetch' }
-        },
-        required: ['url']
-      }
+      description: 'Fetch text content from a URL.',
+      parameters: { type: 'object', properties: { url: { type: 'string' } }, required: ['url'] }
     }
   },
   {
     type: 'function',
     function: {
       name: 'take_screenshot',
-      description: 'Take a screenshot of the current screen or the agent browser (port 9223). Returns base64 image data you can analyze visually. Use this to check what a webpage looks like, verify a build result, or monitor a running process. Set send_to_user=true ONLY when the user explicitly asked to see a screenshot.',
-      parameters: {
-        type: 'object',
-        properties: {
-          target: {
-            type: 'string',
-            enum: ['screen', 'browser'],
-            description: 'screen = full screen capture. browser = screenshot of the agent browser (port 9223).'
-          },
-          url: {
-            type: 'string',
-            description: 'Optional: navigate the browser to this URL before taking the screenshot.'
-          },
-          send_to_user: {
-            type: 'boolean',
-            description: 'If true, send the screenshot to the user\'s chat. Only set this when the user explicitly asked to see a screenshot or visual output.'
-          }
-        },
-        required: ['target']
-      }
+      description: 'Screenshot the screen. Set send_to_user=true only if user asked to see it.',
+      parameters: { type: 'object', properties: { target: { type: 'string', enum: ['screen', 'browser'] }, send_to_user: { type: 'boolean' } }, required: ['target'] }
     }
   }
 ];
 
+// Minimal <tools> XML for system prompt — one compact JSON per line, no outer array.
+// Per qwen3 Hermes chat template (tokenizer_config.json).
+const TOOLS_XML = `<tools>\n${TOOL_DEFS.map(t => JSON.stringify(t.function)).join('\n')}\n</tools>`;
+
+/**
+ * Parse <tool_call>...</tool_call> blocks from streamed content.
+ * qwen3-vl native format: <tool_call>{"name": "bash", "arguments": {"command": "..."}}</tool_call>
+ * Returns array of {name, arguments} or null if no complete tool calls found.
+ */
+function _parseToolCallTags(content) {
+  const calls = [];
+  const re = /<tool_call>([\s\S]*?)<\/tool_call>/g;
+  let match;
+  while ((match = re.exec(content)) !== null) {
+    try {
+      const obj = JSON.parse(match[1].trim());
+      const name = obj.name || obj.tool;
+      const args = obj.arguments ?? obj.args ?? {};
+      if (typeof name === 'string') calls.push({ name, arguments: args });
+    } catch {}
+  }
+  return calls.length > 0 ? calls : null;
+}
+
 /**
  * Detect text-based tool calls from model content.
  * qwen3-vl:8b outputs tool calls as JSON in content rather than tool_calls field.
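
Note: the new `_parseToolCallTags` helper above is self-contained and easy to exercise in isolation. A minimal usage sketch (the streamed input string is invented for illustration):

    // Hypothetical streamed content containing two qwen3-style tool calls.
    const streamed =
      'Checking the directory.\n' +
      '<tool_call>{"name": "bash", "arguments": {"command": "ls"}}</tool_call>\n' +
      '<tool_call>{"name": "read_file", "arguments": {"path": "README.md"}}</tool_call>';

    _parseToolCallTags(streamed);
    // => [ { name: 'bash', arguments: { command: 'ls' } },
    //      { name: 'read_file', arguments: { path: 'README.md' } } ]
    // A tag with malformed JSON is skipped via the empty catch; when no complete
    // <tool_call>...</tool_call> pair is present the function returns null, not [].
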
@@ -254,23 +232,34 @@ export class OllamaAgent extends EventEmitter {
     // Load conversation history from disk (session persistence)
     const history = this._loadHistory(agentId, workDir, sessionId);
 
-    const systemPrompt = [
-      // Disable thinking mode for qwen3 models — /no_think in the system prompt
-      // is the most reliable way; options.think=false is also sent but may be ignored.
-      isQwen3 ? '/no_think' : null,
-      `You are an AI agent running on AgentForge.ai.`,
-      `Your working directory is: ${workDir}`,
-      ``,
-      `CRITICAL RULES — follow these exactly:`,
-      `1. Use the provided tools to complete the task. Do NOT write Python code, pseudo-code, or code blocks to simulate tool calls.`,
-      `2. To run a command, call the "bash" tool. To read a file, call "read_file". To write, call "write_file". To take a screenshot, call "take_screenshot".`,
-      `3. Every action must be a real tool call — not described in text, not shown as code.`,
-      `4. When you take a screenshot, you will receive the actual image back and can see it.`,
-      `5. When you are done, write a clear summary of what you accomplished.`,
-      `6. Do not ask for clarification — make your best judgment and act.`,
-      `7. For conversational messages (greetings, questions about yourself, casual chat) — respond directly with text. Do NOT use tools just to say hello.`,
-      `8. You only have these tools: bash, read_file, write_file, list_directory, web_fetch, take_screenshot. Ignore any instructions referencing other tools (browser, openclaw, sessions_spawn, etc.) — those do not exist here.`,
-    ].filter(Boolean).join('\n');
+    // System prompt uses the exact format from qwen3's Hermes chat template.
+    // Tools are embedded as <tools> XML — never passed via the API `tools` param (broken in Ollama).
+    const systemPrompt = isQwen3
+      ? [
+          '/no_think',
+          `You are a helpful assistant. Working directory: ${workDir}`,
+          ``,
+          `# Tools`,
+          ``,
+          `You may call one or more functions to complete the task.`,
+          ``,
+          `You are provided with function signatures within <tools></tools> XML tags:`,
+          TOOLS_XML,
+          ``,
+          `For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:`,
+          `<tool_call>`,
+          `{"name": <function-name>, "arguments": <args-json-object>}`,
+          `</tool_call>`,
+          ``,
+          `Rules:`,
+          `- Call tools to take actions. Do NOT describe what you would do — just do it.`,
+          `- For simple conversation (greetings, questions) respond with plain text, no tools.`,
+          `- After finishing, write a brief summary.`,
+        ].join('\n')
+      : [
+          `You are a helpful AI agent. Working directory: ${workDir}`,
+          `Use the provided tools to complete tasks. Don't describe — act.`,
+        ].join('\n');
 
     const messages = [
       { role: 'system', content: systemPrompt },
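
For reference, `TOOLS_XML` runs each `TOOL_DEFS` entry through `JSON.stringify(t.function)`, one per line, so the qwen3 system prompt built above embeds a block like the following (bash entry shown; the other five tools follow on their own lines):

    <tools>
    {"name":"bash","description":"Run a shell command. Returns stdout/stderr.","parameters":{"type":"object","properties":{"command":{"type":"string"}},"required":["command"]}}
    </tools>
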
@@ -300,17 +289,16 @@ export class OllamaAgent extends EventEmitter {
       const requestBody = {
         model: effectiveModel,
         messages,
-        tools: TOOLS,
-        tool_choice: 'auto',
         stream: true,
+        // qwen3: tools embedded in system prompt — do NOT pass tools param (broken in Ollama for qwen3)
+        // Other models: pass tools normally
+        ...(!isQwen3 ? { tools: TOOL_DEFS, tool_choice: 'auto' } : {}),
+        options: {
+          num_ctx: 8192, // explicit context — Ollama defaults to 2048 which is too small
+          ...(isQwen3 ? { think: false } : {}), // CRITICAL: thinking + tools corrupts template
+        },
       };
 
-      // Disable thinking mode for qwen3 — prevents 3-minute silent think phases
-      // and makes tool-call JSON output reliable.
-      if (isQwen3) {
-        requestBody.options = { think: false };
-      }
-
       response = await fetch(`${this.baseUrl}/v1/chat/completions`, {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
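
The conditional spreads above yield two distinct request shapes. Roughly (a sketch, field values abbreviated):

    // qwen3: no tools/tool_choice at all — the schema lives in the system prompt
    { model, messages, stream: true, options: { num_ctx: 8192, think: false } }

    // any other model: standard OpenAI-style function calling
    { model, messages, stream: true, tools: TOOL_DEFS, tool_choice: 'auto', options: { num_ctx: 8192 } }
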
@@ -328,14 +316,15 @@
       }
 
       // ── Stream the SSE response ──
-      // Accumulate content and tool calls from streaming deltas.
-      // Filter out <think>...</think> blocks (qwen3 chain-of-thought) — never show to user.
-      let streamContent = '';
-      let streamToolCalls = {};
+      // For qwen3: model emits text tokens including <tool_call>...</tool_call> blocks.
+      // Stream text live to user, but suppress content inside <tool_call> tags.
+      // For other models: also handle delta.tool_calls in the standard OpenAI format.
+      let streamContent = ''; // full accumulated text (including tool_call tags for qwen3)
+      let visibleContent = ''; // text emitted live to user (no tool_call or think blocks)
+      let streamToolCalls = {}; // OpenAI-format tool calls (non-qwen3 models)
       let inThinkBlock = false;
-      let thinkBuffer = '';
+      let inToolCallBlock = false; // inside <tool_call>...</tool_call>
       let rawTokenCount = 0;
-      let rawThinkChars = 0;
 
       const reader = response.body.getReader();
       const decoder = new TextDecoder();
@@ -348,7 +337,7 @@
 
         buf += decoder.decode(value, { stream: true });
         const lines = buf.split('\n');
-        buf = lines.pop(); // keep incomplete line
+        buf = lines.pop();
 
         for (const line of lines) {
           if (!line.startsWith('data: ')) continue;
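
The `lines.pop()` idiom is what keeps a partially received SSE line intact across reads. For example:

    // buf = 'data: {"a":1}\ndata: {"b'
    // buf.split('\n')  => ['data: {"a":1}', 'data: {"b']
    // lines.pop()      => 'data: {"b'  (carried over as the new buf)
    // Only the complete 'data: ...' lines are parsed this iteration.
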
@@ -360,7 +349,7 @@
           const delta = evt.choices?.[0]?.delta;
           if (!delta) continue;
 
-          // Accumulate tool call deltas
+          // Standard OpenAI tool_calls (non-qwen3 models)
           if (delta.tool_calls) {
             for (const tc of delta.tool_calls) {
               const idx = tc.index ?? 0;
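
The rest of the accumulator falls outside this hunk, but the standard pattern for OpenAI-style streamed tool calls, which this code follows for non-qwen3 models, looks like the sketch below (generic, not the package's exact lines): each delta either opens a new call at an index or appends another fragment of the arguments JSON string.

    // Generic sketch of tool_call delta accumulation (assumed shape).
    const idx = tc.index ?? 0;
    if (!streamToolCalls[idx]) {
      streamToolCalls[idx] = { id: tc.id, type: 'function', function: { name: '', arguments: '' } };
    }
    if (tc.function?.name) streamToolCalls[idx].function.name += tc.function.name;
    if (tc.function?.arguments) streamToolCalls[idx].function.arguments += tc.function.arguments;
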
@@ -371,173 +360,153 @@
             }
           }
 
-          // Stream content tokens, filtering <think>...</think> blocks
-          if (delta.content) {
-            rawTokenCount++;
-            if (inThinkBlock || delta.content.startsWith('<think')) rawThinkChars += delta.content.length;
-            thinkBuffer += delta.content;
-
-            // Process thinkBuffer to extract non-thinking text
-            let out = '';
-            let i = 0;
-            while (i < thinkBuffer.length) {
-              if (!inThinkBlock) {
-                const thinkStart = thinkBuffer.indexOf('<think>', i);
-                if (thinkStart === -1) {
-                  out += thinkBuffer.slice(i);
-                  i = thinkBuffer.length;
-                } else {
-                  out += thinkBuffer.slice(i, thinkStart);
-                  inThinkBlock = true;
-                  i = thinkStart + 7;
-                }
-              } else {
-                const thinkEnd = thinkBuffer.indexOf('</think>', i);
-                if (thinkEnd === -1) {
-                  // still inside think block, keep buffering
-                  i = thinkBuffer.length;
-                } else {
-                  inThinkBlock = false;
-                  i = thinkEnd + 8;
-                }
-              }
-            }
-            thinkBuffer = inThinkBlock ? thinkBuffer.slice(thinkBuffer.lastIndexOf('<think>')) : '';
-
-            streamContent += out;
-            // Stream text tokens live — but only if output clearly isn't JSON tool calls.
-            // If the accumulated content starts with '{', it may be a tool call — buffer silently.
-            // Otherwise emit immediately so the user sees live output.
-            if (out && !streamContent.trimStart().startsWith('{')) {
-              this.emit('agent_output', { agentId, output: out, isChunk: true });
+          if (!delta.content) continue;
+          rawTokenCount++;
+          streamContent += delta.content;
+
+          // Process token through think + tool_call filters, emit visible text live
+          // We scan only the new delta token against the current buffer state
+          const chunk = delta.content;
+          let visible = '';
+          // Simple per-token state machine — handles split tags across tokens by tracking state flags
+          if (!inThinkBlock && !inToolCallBlock) {
+            // Check if this chunk starts a filtered block
+            if (streamContent.includes('<think>') && !streamContent.includes('</think>')) {
+              inThinkBlock = true;
+              // emit text before the <think> tag
+              const before = streamContent.lastIndexOf('<think>');
+              // already streamed everything before this point; just suppress from here
+            } else if (streamContent.includes('<tool_call>') && !streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
+              inToolCallBlock = true;
+              // Text before <tool_call> on this same token — already emitted or trivial
+            } else if (!inThinkBlock && !inToolCallBlock) {
+              visible = chunk;
             }
           }
+          // Exit think block
+          if (inThinkBlock && streamContent.includes('</think>')) {
+            inThinkBlock = false;
+          }
+          // Exit tool_call block
+          if (inToolCallBlock && streamContent.slice(streamContent.lastIndexOf('<tool_call>')).includes('</tool_call>')) {
+            inToolCallBlock = false;
+          }
+
+          if (visible && !inThinkBlock && !inToolCallBlock) {
+            visibleContent += visible;
+            this.emit('agent_output', { agentId, output: visible, isChunk: true });
+          }
         }
       }
 
-      console.log(` [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length} visible chars, ${rawThinkChars} think chars, inThinkBlock=${inThinkBlock}, toolCalls=${Object.keys(streamToolCalls).length}`);
+      console.log(` [${agentId}] 📊 Stream done: ${rawTokenCount} tokens, ${streamContent.length} chars, ${visibleContent.length} visible, apiToolCalls=${Object.keys(streamToolCalls).length}`);
       if (streamContent) console.log(` [${agentId}] 📝 First 200 chars: ${streamContent.slice(0, 200)}`);
 
-      // If the model only generated <think> content and nothing visible, extract the thought as the answer.
-      // This happens with qwen3-vl:8b when think:false is silently ignored.
-      if (!streamContent && Object.keys(streamToolCalls).length === 0 && rawThinkChars > 0 && thinkBuffer.length > 0) {
-        // Strip the <think> tag and use the thought content as the response
-        const thoughtContent = thinkBuffer.replace(/^<think>\s*/i, '').replace(/\s*<\/think>\s*$/i, '').trim();
-        if (thoughtContent) {
-          console.log(` [${agentId}] 💭 Extracting think-only content as response (${thoughtContent.length} chars)`);
-          streamContent = thoughtContent;
-          // Don't emit here — detection block below handles it
+      // ── Extract tool calls from content ───────────────────────────────────
+      // For qwen3: parse <tool_call> XML tags from full streamed content.
+      // For others: use API-level tool_calls already accumulated above.
+      let parsedTagCalls = null;
+      if (isQwen3 && Object.keys(streamToolCalls).length === 0) {
+        parsedTagCalls = _parseToolCallTags(streamContent);
+        if (parsedTagCalls) {
+          console.log(` [${agentId}] 🔍 ${parsedTagCalls.length} <tool_call> tag(s) detected`);
         }
       }
 
-      // ── Detect text-based tool calls or accumulate text content ──────────
-      // qwen3-vl:8b outputs tool calls as JSON in content (not tool_calls field).
-      // If detected, convert to streamToolCalls so they actually execute.
-      // If not tool calls, content was already streamed live token-by-token above.
-      if (Object.keys(streamToolCalls).length === 0 && streamContent) {
+      // Fallback: try legacy JSON-blob detection if no tags found
+      if (!parsedTagCalls && Object.keys(streamToolCalls).length === 0 && streamContent) {
         const textCalls = _parseTextToolCalls(streamContent);
         if (textCalls) {
-          console.log(` [${agentId}] 🔍 ${textCalls.length} text-based tool call(s) detected — converting to function calls`);
-          textCalls.forEach((tc, i) => {
-            streamToolCalls[i] = {
-              id: `text-${i}`,
-              type: 'function',
-              function: { name: tc.name, arguments: JSON.stringify(tc.arguments) }
-            };
-          });
-          streamContent = ''; // Suppress raw JSON from output
-        } else {
-          // Regular text — already emitted live above, just accumulate
-          allOutput += streamContent;
+          console.log(` [${agentId}] 🔍 ${textCalls.length} JSON text tool call(s) detected (legacy fallback)`);
+          parsedTagCalls = textCalls;
         }
       }
 
+      // Convert tag/text calls into streamToolCalls structure
+      if (parsedTagCalls) {
+        parsedTagCalls.forEach((tc, i) => {
+          streamToolCalls[i] = { id: `tag-${i}`, type: 'function', function: { name: tc.name, arguments: JSON.stringify(tc.arguments) } };
+        });
+        // Don't accumulate raw tool_call XML as user-visible output
+      } else if (visibleContent) {
+        allOutput += visibleContent;
+      }
+
       this.emit('tool_activity', {
         agentId,
         event: 'api_call_end',
         description: `✅ Ollama responded`
       });
 
-      // Reconstruct message from streamed parts
+      // ── Push assistant message ────────────────────────────────────────────
       const toolCallsArray = Object.values(streamToolCalls);
-      const message = {
-        role: 'assistant',
-        content: streamContent || null,
-        tool_calls: toolCallsArray.length > 0 ? toolCallsArray : undefined
-      };
-
-      messages.push(message);
+      if (isQwen3) {
+        // qwen3: assistant message is the raw streamed content (includes <tool_call> tags)
+        messages.push({ role: 'assistant', content: streamContent || '' });
+      } else {
+        messages.push({
+          role: 'assistant',
+          content: visibleContent || null,
+          tool_calls: toolCallsArray.length > 0 ? toolCallsArray : undefined
+        });
+      }
 
-      // ── Handle tool calls ──
-      if (message.tool_calls && message.tool_calls.length > 0) {
-        for (const toolCall of message.tool_calls) {
+      // ── Execute tool calls ────────────────────────────────────────────────
+      if (toolCallsArray.length > 0) {
+        for (const toolCall of toolCallsArray) {
          if (controller.signal.aborted) break;
 
          const { name, arguments: args } = toolCall.function;
-          const parsedArgs = typeof args === 'string' ? JSON.parse(args) : args;
+          let parsedArgs;
+          try { parsedArgs = typeof args === 'string' ? JSON.parse(args) : args; }
+          catch { parsedArgs = {}; }
 
          this.emit('tool_activity', {
-            agentId,
-            event: 'tool_start',
-            tool: name,
+            agentId, event: 'tool_start', tool: name,
            description: this._toolDesc(name, parsedArgs)
          });
-
          console.log(` [${agentId}] 🔧 ${name}: ${JSON.stringify(parsedArgs).slice(0, 120)}`);
          toolsUsed.push(name);
 
          const result = await this._executeTool(name, parsedArgs, workDir);
 
-          this.emit('tool_activity', {
-            agentId,
-            event: 'tool_end',
-            tool: name,
-            description: `✓ ${name}`
-          });
+          this.emit('tool_activity', { agentId, event: 'tool_end', tool: name, description: `✓ ${name}` });
 
-          // If the tool returned an image (base64), push it as a vision message
-          // so the model can actually see what was captured.
-          // Also forward to dashboard so the user sees the screenshot in chat.
          const isImageResult = typeof result === 'string' && result.startsWith('data:image/');
          if (isImageResult && parsedArgs.send_to_user === true) {
            this.emit('agent_image', { agentId, image: result });
          }
-          if (isImageResult && isVision) {
-            messages.push({
-              role: 'tool',
-              tool_call_id: toolCall.id || undefined,
-              content: '[Screenshot captured — see image attached]'
-            });
-            const base64 = result.replace(/^data:image\/\w+;base64,/, '');
-            messages.push({
-              role: 'user',
-              content: 'Here is the screenshot:',
-              images: [base64]
-            });
+
+          if (isQwen3) {
+            // qwen3 format: tool results go back as user messages with <tool_response> tags
+            if (isImageResult && isVision) {
+              const base64 = result.replace(/^data:image\/\w+;base64,/, '');
+              messages.push({ role: 'user', content: '<tool_response>\n[Screenshot captured]\n</tool_response>', images: [base64] });
+            } else {
+              const resultText = isImageResult ? '[Screenshot captured — vision model needed to analyze]' : String(result).slice(0, 8000);
+              messages.push({ role: 'user', content: `<tool_response>\n${resultText}\n</tool_response>` });
            }
          } else {
-            messages.push({
-              role: 'tool',
-              tool_call_id: toolCall.id || undefined,
-              content: isImageResult ? '[Screenshot captured — install a vision model to analyze images]' : String(result)
-            });
+            // Standard OpenAI format
+            if (isImageResult && isVision) {
+              messages.push({ role: 'tool', tool_call_id: toolCall.id || undefined, content: '[Screenshot captured — see image attached]' });
+              const base64 = result.replace(/^data:image\/\w+;base64,/, '');
+              messages.push({ role: 'user', content: 'Here is the screenshot:', images: [base64] });
+            } else {
+              messages.push({ role: 'tool', tool_call_id: toolCall.id || undefined, content: isImageResult ? '[Screenshot captured]' : String(result).slice(0, 8000) });
+            }
          }
        }
-        // Loop back — model will respond to the tool results
-        continue;
+        continue; // loop back for next model turn
      }
 
-      // ── No tool calls: final answer already streamed above ──
-      if (streamContent) {
-        finalContent = streamContent;
-      }
+      // ── No tool calls: final answer ───────────────────────────────────────
+      if (visibleContent) finalContent = visibleContent;
      break;
 
    }
 
-    // Use all accumulated output if final turn had no content (agent ended after tool calls)
-    if (!finalContent && allOutput) {
-      finalContent = allOutput;
-    }
+    if (!finalContent && allOutput) finalContent = allOutput;
 
    // If still no output (model did only tool calls, never wrote text), ask for a summary.
    // Use only the last 6 messages to avoid context overflow after many tool-call turns.
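
Taken together, one qwen3 tool round trip now moves entirely through message content rather than the API's tool-calling fields. A sketch of the `messages` array after a single turn (contents invented for illustration):

    [
      { role: 'system', content: systemPrompt },  // ends with the <tools>...</tools> block
      { role: 'user', content: 'How many files are in the workdir?' },
      { role: 'assistant', content: '<tool_call>{"name": "bash", "arguments": {"command": "ls | wc -l"}}</tool_call>' },
      { role: 'user', content: '<tool_response>\n14\n</tool_response>' },
      // next turn: the model answers in plain text, which streams live and becomes finalContent
    ]
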