@aman_asmuei/aman-agent 0.7.7 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -392,7 +392,33 @@ Default budget: 8,000 tokens. Override with `--budget`.
392
392
  |:---|:---|:---|:---|
393
393
  | **Anthropic** | Claude Sonnet 4.6, Opus 4.6, Haiku 4.5 | Full | Full (with tools) |
394
394
  | **OpenAI** | GPT-4o, GPT-4o Mini, o3 | Full | Full (with tools) |
395
- | **Ollama** | Llama, Mistral, Gemma, any local model | Text only | Full |
395
+ | **Ollama** | Llama, Mistral, Gemma, any local model | Model-dependent | Full (with tools) |
396
+
397
+ ### Image Support (Vision)
398
+
399
+ Reference image files or URLs in your message and they'll be sent as vision content to the LLM:
400
+
401
+ ```
402
+ You > What's in this screenshot? ~/Desktop/screenshot.png
403
+ [attached image: screenshot.png (245.3KB)]
404
+ ```
405
+
406
+ **Supported formats:** `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp`, `.bmp`
407
+
408
+ **Image URLs** are also supported — paste any `https://` URL ending in `.png`, `.jpg`, `.jpeg`, `.gif`, or `.webp` and it will be fetched and attached.
409
+
410
+ **Multiple files** can be referenced in a single message (images, text files, and documents together).
411
+
412
+ **Size limit:** 20MB per image; larger images are skipped with a warning.
413
+
414
+ **Vision model requirements:**
415
+ | Provider | Vision Models |
416
+ |:---|:---|
417
+ | **Anthropic** | All Claude models (Sonnet, Opus, Haiku) |
418
+ | **OpenAI** | GPT-4o, GPT-4o Mini |
419
+ | **Ollama** | LLaVA, Llama 3.2 Vision, Moondream, BakLLaVA |
420
+
421
+ Non-vision models will receive the image but may not be able to interpret it.
396
422
 
397
423
  ---
398
424
 
package/dist/index.js CHANGED
@@ -147,6 +147,16 @@ function toAnthropicMessages(messages) {
147
147
  if (block.type === "text") {
148
148
  return { type: "text", text: block.text };
149
149
  }
150
+ if (block.type === "image") {
151
+ return {
152
+ type: "image",
153
+ source: {
154
+ type: "base64",
155
+ media_type: block.source.media_type,
156
+ data: block.source.data
157
+ }
158
+ };
159
+ }
150
160
  if (block.type === "tool_use") {
151
161
  return {
152
162
  type: "tool_use",
@@ -311,8 +321,26 @@ function toOpenAIMessages(systemPrompt, messages) {
311
321
  }
312
322
  }
313
323
  } else {
314
- const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
315
- result.push({ role: "user", content: text2 });
324
+ const hasImages = m.content.some((b) => b.type === "image");
325
+ if (hasImages) {
326
+ const parts = [];
327
+ for (const b of m.content) {
328
+ if (b.type === "text") {
329
+ parts.push({ type: "text", text: b.text });
330
+ } else if (b.type === "image") {
331
+ parts.push({
332
+ type: "image_url",
333
+ image_url: {
334
+ url: `data:${b.source.media_type};base64,${b.source.data}`
335
+ }
336
+ });
337
+ }
338
+ }
339
+ result.push({ role: "user", content: parts });
340
+ } else {
341
+ const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
342
+ result.push({ role: "user", content: text2 });
343
+ }
316
344
  }
317
345
  }
318
346
  }
@@ -415,6 +443,74 @@ function createOpenAIClient(apiKey, model) {
415
443
 
416
444
  // src/llm/ollama.ts
417
445
  import OpenAI2 from "openai";
446
+ function toOllamaMessages(systemPrompt, messages) {
447
+ const result = [
448
+ { role: "system", content: systemPrompt }
449
+ ];
450
+ for (const m of messages) {
451
+ if (typeof m.content === "string") {
452
+ result.push({
453
+ role: m.role,
454
+ content: m.content
455
+ });
456
+ } else if (m.role === "assistant") {
457
+ const textParts = m.content.filter((b) => b.type === "text");
458
+ const toolUseParts = m.content.filter((b) => b.type === "tool_use");
459
+ const text2 = textParts.map((b) => "text" in b ? b.text : "").join("");
460
+ if (toolUseParts.length > 0) {
461
+ result.push({
462
+ role: "assistant",
463
+ content: text2 || null,
464
+ tool_calls: toolUseParts.map((b) => ({
465
+ id: "id" in b ? b.id : "",
466
+ type: "function",
467
+ function: {
468
+ name: "name" in b ? b.name : "",
469
+ arguments: JSON.stringify("input" in b ? b.input : {})
470
+ }
471
+ }))
472
+ });
473
+ } else {
474
+ result.push({ role: "assistant", content: text2 });
475
+ }
476
+ } else if (m.role === "user") {
477
+ const toolResults = m.content.filter((b) => b.type === "tool_result");
478
+ if (toolResults.length > 0) {
479
+ for (const tr of toolResults) {
480
+ if (tr.type === "tool_result") {
481
+ result.push({
482
+ role: "tool",
483
+ tool_call_id: tr.tool_use_id,
484
+ content: tr.content
485
+ });
486
+ }
487
+ }
488
+ } else {
489
+ const hasImages = m.content.some((b) => b.type === "image");
490
+ if (hasImages) {
491
+ const parts = [];
492
+ for (const b of m.content) {
493
+ if (b.type === "text") {
494
+ parts.push({ type: "text", text: b.text });
495
+ } else if (b.type === "image") {
496
+ parts.push({
497
+ type: "image_url",
498
+ image_url: {
499
+ url: `data:${b.source.media_type};base64,${b.source.data}`
500
+ }
501
+ });
502
+ }
503
+ }
504
+ result.push({ role: "user", content: parts });
505
+ } else {
506
+ const text2 = m.content.map((b) => "text" in b ? b.text : "").join("");
507
+ result.push({ role: "user", content: text2 });
508
+ }
509
+ }
510
+ }
511
+ }
512
+ return result;
513
+ }
418
514
  function createOllamaClient(model, baseURL) {
419
515
  const client = new OpenAI2({
420
516
  baseURL: baseURL || "http://localhost:11434/v1",
@@ -422,28 +518,83 @@ function createOllamaClient(model, baseURL) {
422
518
  // Ollama doesn't require a real key
423
519
  });
424
520
  return {
425
- async chat(systemPrompt, messages, onChunk, _tools) {
426
- let fullText = "";
521
+ async chat(systemPrompt, messages, onChunk, tools) {
522
+ const ollamaMessages = toOllamaMessages(systemPrompt, messages);
523
+ const hasTools = tools && tools.length > 0;
427
524
  try {
428
- const stream = await client.chat.completions.create({
525
+ let fullText = "";
526
+ const toolCallAccumulators = /* @__PURE__ */ new Map();
527
+ const createParams = {
429
528
  model,
430
529
  max_tokens: 8192,
431
- messages: [
432
- { role: "system", content: systemPrompt },
433
- ...messages.map((m) => ({
434
- role: m.role,
435
- content: typeof m.content === "string" ? m.content : m.content.filter((b) => b.type === "text").map((b) => "text" in b ? b.text : "").join("")
436
- }))
437
- ],
530
+ messages: ollamaMessages,
438
531
  stream: true
439
- });
532
+ };
533
+ if (hasTools) {
534
+ createParams.tools = tools.map((t) => ({
535
+ type: "function",
536
+ function: {
537
+ name: t.name,
538
+ description: t.description,
539
+ parameters: t.input_schema
540
+ }
541
+ }));
542
+ }
543
+ const stream = await client.chat.completions.create(
544
+ createParams
545
+ );
440
546
  for await (const chunk of stream) {
441
- const text2 = chunk.choices[0]?.delta?.content || "";
442
- if (text2) {
443
- fullText += text2;
444
- onChunk({ type: "text", text: text2 });
547
+ const delta = chunk.choices[0]?.delta;
548
+ if (!delta) continue;
549
+ if (delta.content) {
550
+ fullText += delta.content;
551
+ onChunk({ type: "text", text: delta.content });
552
+ }
553
+ if (delta.tool_calls) {
554
+ for (const tc of delta.tool_calls) {
555
+ const idx = tc.index;
556
+ let acc = toolCallAccumulators.get(idx);
557
+ if (!acc) {
558
+ acc = { id: "", name: "", arguments: "" };
559
+ toolCallAccumulators.set(idx, acc);
560
+ }
561
+ if (tc.id) {
562
+ acc.id = tc.id;
563
+ }
564
+ if (tc.function?.name) {
565
+ acc.name = tc.function.name;
566
+ }
567
+ if (tc.function?.arguments) {
568
+ acc.arguments += tc.function.arguments;
569
+ }
570
+ }
445
571
  }
446
572
  }
573
+ const toolUses = Array.from(toolCallAccumulators.entries()).sort(([a], [b]) => a - b).map(([, acc]) => ({
574
+ id: acc.id,
575
+ name: acc.name,
576
+ input: JSON.parse(acc.arguments || "{}")
577
+ }));
578
+ onChunk({ type: "done" });
579
+ if (toolUses.length > 0) {
580
+ const contentBlocks = [
581
+ ...fullText ? [{ type: "text", text: fullText }] : [],
582
+ ...toolUses.map((tu) => ({
583
+ type: "tool_use",
584
+ id: tu.id,
585
+ name: tu.name,
586
+ input: tu.input
587
+ }))
588
+ ];
589
+ return {
590
+ message: { role: "assistant", content: contentBlocks },
591
+ toolUses
592
+ };
593
+ }
594
+ return {
595
+ message: { role: "assistant", content: fullText },
596
+ toolUses: []
597
+ };
447
598
  } catch (error) {
448
599
  if (error instanceof Error && error.message.includes("ECONNREFUSED")) {
449
600
  throw new Error(
@@ -452,11 +603,6 @@ function createOllamaClient(model, baseURL) {
452
603
  }
453
604
  throw error;
454
605
  }
455
- onChunk({ type: "done" });
456
- return {
457
- message: { role: "assistant", content: fullText },
458
- toolUses: []
459
- };
460
606
  }
461
607
  };
462
608
  }
@@ -2081,106 +2227,180 @@ ${wfMatch.steps}
2081
2227
  }
2082
2228
  }
2083
2229
  await trimConversation(messages, client);
2084
- let enrichedInput = input;
2085
- const filePathMatch = input.match(/(\/[\w./-]+|~\/[\w./-]+)/);
2086
- if (filePathMatch) {
2087
- let filePath = filePathMatch[1];
2230
+ const textExts = /* @__PURE__ */ new Set([
2231
+ ".txt",
2232
+ ".md",
2233
+ ".json",
2234
+ ".js",
2235
+ ".ts",
2236
+ ".jsx",
2237
+ ".tsx",
2238
+ ".py",
2239
+ ".html",
2240
+ ".css",
2241
+ ".yml",
2242
+ ".yaml",
2243
+ ".toml",
2244
+ ".xml",
2245
+ ".csv",
2246
+ ".sh",
2247
+ ".bash",
2248
+ ".zsh",
2249
+ ".env",
2250
+ ".cfg",
2251
+ ".ini",
2252
+ ".log",
2253
+ ".sql",
2254
+ ".graphql",
2255
+ ".rs",
2256
+ ".go",
2257
+ ".java",
2258
+ ".rb",
2259
+ ".php",
2260
+ ".c",
2261
+ ".cpp",
2262
+ ".h",
2263
+ ".swift",
2264
+ ".kt",
2265
+ ".r",
2266
+ ".lua"
2267
+ ]);
2268
+ const imageExts = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"]);
2269
+ const docExts = /* @__PURE__ */ new Set([".docx", ".doc", ".pdf", ".pptx", ".ppt", ".xlsx", ".xls", ".odt", ".rtf", ".epub"]);
2270
+ const mimeMap = {
2271
+ ".png": "image/png",
2272
+ ".jpg": "image/jpeg",
2273
+ ".jpeg": "image/jpeg",
2274
+ ".gif": "image/gif",
2275
+ ".webp": "image/webp",
2276
+ ".bmp": "image/png"
2277
+ };
2278
+ const maxImageBytes = 20 * 1024 * 1024;
2279
+ let textContent = input;
2280
+ const imageBlocks = [];
2281
+ const filePathMatches = [...input.matchAll(/(\/[\w./-]+|~\/[\w./-]+)/g)];
2282
+ for (const match of filePathMatches) {
2283
+ let filePath = match[1];
2088
2284
  if (filePath.startsWith("~/")) {
2089
2285
  filePath = path7.join(os7.homedir(), filePath.slice(2));
2090
2286
  }
2091
- if (fs7.existsSync(filePath) && fs7.statSync(filePath).isFile()) {
2092
- const ext = path7.extname(filePath).toLowerCase();
2093
- const textExts = /* @__PURE__ */ new Set([
2094
- ".txt",
2095
- ".md",
2096
- ".json",
2097
- ".js",
2098
- ".ts",
2099
- ".jsx",
2100
- ".tsx",
2101
- ".py",
2102
- ".html",
2103
- ".css",
2104
- ".yml",
2105
- ".yaml",
2106
- ".toml",
2107
- ".xml",
2108
- ".csv",
2109
- ".sh",
2110
- ".bash",
2111
- ".zsh",
2112
- ".env",
2113
- ".cfg",
2114
- ".ini",
2115
- ".log",
2116
- ".sql",
2117
- ".graphql",
2118
- ".rs",
2119
- ".go",
2120
- ".java",
2121
- ".rb",
2122
- ".php",
2123
- ".c",
2124
- ".cpp",
2125
- ".h",
2126
- ".swift",
2127
- ".kt",
2128
- ".r",
2129
- ".lua"
2130
- ]);
2131
- if (textExts.has(ext) || ext === "") {
2132
- try {
2133
- const content = fs7.readFileSync(filePath, "utf-8");
2134
- const maxChars = 5e4;
2135
- const trimmed = content.length > maxChars ? content.slice(0, maxChars) + `
2287
+ if (!fs7.existsSync(filePath) || !fs7.statSync(filePath).isFile()) continue;
2288
+ const ext = path7.extname(filePath).toLowerCase();
2289
+ if (imageExts.has(ext)) {
2290
+ try {
2291
+ const stat = fs7.statSync(filePath);
2292
+ if (stat.size > maxImageBytes) {
2293
+ process.stdout.write(pc3.yellow(` [skipped: ${path7.basename(filePath)} \u2014 exceeds 20MB limit]
2294
+ `));
2295
+ continue;
2296
+ }
2297
+ const data = fs7.readFileSync(filePath).toString("base64");
2298
+ const mediaType = mimeMap[ext] || "image/png";
2299
+ imageBlocks.push({
2300
+ type: "image",
2301
+ source: { type: "base64", media_type: mediaType, data }
2302
+ });
2303
+ process.stdout.write(pc3.dim(` [attached image: ${path7.basename(filePath)} (${(stat.size / 1024).toFixed(1)}KB)]
2304
+ `));
2305
+ } catch {
2306
+ process.stdout.write(pc3.dim(` [could not read image: ${filePath}]
2307
+ `));
2308
+ }
2309
+ } else if (textExts.has(ext) || ext === "") {
2310
+ try {
2311
+ const content = fs7.readFileSync(filePath, "utf-8");
2312
+ const maxChars = 5e4;
2313
+ const trimmed = content.length > maxChars ? content.slice(0, maxChars) + `
2136
2314
 
2137
2315
  [... truncated, ${content.length - maxChars} chars remaining]` : content;
2138
- enrichedInput = `${input}
2316
+ textContent += `
2139
2317
 
2140
2318
  <file path="${filePath}" size="${content.length} chars">
2141
2319
  ${trimmed}
2142
2320
  </file>`;
2143
- process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (${(content.length / 1024).toFixed(1)}KB)]
2321
+ process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (${(content.length / 1024).toFixed(1)}KB)]
2144
2322
  `));
2145
- } catch {
2146
- process.stdout.write(pc3.dim(` [could not read: ${filePath}]
2323
+ } catch {
2324
+ process.stdout.write(pc3.dim(` [could not read: ${filePath}]
2147
2325
  `));
2148
- }
2149
- } else if ([".docx", ".doc", ".pdf", ".pptx", ".ppt", ".xlsx", ".xls", ".odt", ".rtf", ".epub"].includes(ext)) {
2150
- if (mcpManager) {
2151
- try {
2152
- process.stdout.write(pc3.dim(` [converting: ${path7.basename(filePath)}...]
2326
+ }
2327
+ } else if (docExts.has(ext)) {
2328
+ if (mcpManager) {
2329
+ try {
2330
+ process.stdout.write(pc3.dim(` [converting: ${path7.basename(filePath)}...]
2153
2331
  `));
2154
- const converted = await mcpManager.callTool("doc_convert", { path: filePath });
2155
- if (converted && !converted.startsWith("Error") && !converted.includes("Could not convert")) {
2156
- enrichedInput = `${input}
2332
+ const converted = await mcpManager.callTool("doc_convert", { path: filePath });
2333
+ if (converted && !converted.startsWith("Error") && !converted.includes("Could not convert")) {
2334
+ textContent += `
2157
2335
 
2158
2336
  <file path="${filePath}" format="${ext}">
2159
2337
  ${converted.slice(0, 5e4)}
2160
2338
  </file>`;
2161
- process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (converted from ${ext})]
2339
+ process.stdout.write(pc3.dim(` [attached: ${path7.basename(filePath)} (converted from ${ext})]
2162
2340
  `));
2163
- } else {
2164
- enrichedInput = `${input}
2341
+ } else {
2342
+ textContent += `
2165
2343
 
2166
2344
  <file-error path="${filePath}">
2167
2345
  ${converted}
2168
2346
  </file-error>`;
2169
- process.stdout.write(pc3.yellow(` [conversion note: ${converted.split("\n")[0]}]
2170
- `));
2171
- }
2172
- } catch {
2173
- process.stdout.write(pc3.dim(` [could not convert: ${path7.basename(filePath)}]
2347
+ process.stdout.write(pc3.yellow(` [conversion note: ${converted.split("\n")[0]}]
2174
2348
  `));
2175
2349
  }
2176
- } else {
2177
- process.stdout.write(pc3.yellow(` Binary file (${ext}) \u2014 install Docling for document support: pip install docling
2350
+ } catch {
2351
+ process.stdout.write(pc3.dim(` [could not convert: ${path7.basename(filePath)}]
2178
2352
  `));
2179
2353
  }
2354
+ } else {
2355
+ process.stdout.write(pc3.yellow(` Binary file (${ext}) \u2014 install Docling for document support: pip install docling
2356
+ `));
2180
2357
  }
2181
2358
  }
2182
2359
  }
2183
- messages.push({ role: "user", content: enrichedInput });
2360
+ const urlImageMatches = [...input.matchAll(/https?:\/\/\S+\.(?:png|jpg|jpeg|gif|webp)(?:\?\S*)?/gi)];
2361
+ for (const match of urlImageMatches) {
2362
+ const url = match[0];
2363
+ try {
2364
+ process.stdout.write(pc3.dim(` [fetching image: ${url.slice(0, 60)}...]
2365
+ `));
2366
+ const response = await fetch(url);
2367
+ if (!response.ok) {
2368
+ process.stdout.write(pc3.yellow(` [could not fetch: HTTP ${response.status}]
2369
+ `));
2370
+ continue;
2371
+ }
2372
+ const buffer = Buffer.from(await response.arrayBuffer());
2373
+ if (buffer.length > maxImageBytes) {
2374
+ process.stdout.write(pc3.yellow(` [skipped: image URL exceeds 20MB limit]
2375
+ `));
2376
+ continue;
2377
+ }
2378
+ const contentType = response.headers.get("content-type") || "";
2379
+ let mediaType = "image/png";
2380
+ if (contentType.includes("jpeg") || contentType.includes("jpg")) mediaType = "image/jpeg";
2381
+ else if (contentType.includes("gif")) mediaType = "image/gif";
2382
+ else if (contentType.includes("webp")) mediaType = "image/webp";
2383
+ else if (contentType.includes("png")) mediaType = "image/png";
2384
+ imageBlocks.push({
2385
+ type: "image",
2386
+ source: { type: "base64", media_type: mediaType, data: buffer.toString("base64") }
2387
+ });
2388
+ process.stdout.write(pc3.dim(` [attached image URL: (${(buffer.length / 1024).toFixed(1)}KB)]
2389
+ `));
2390
+ } catch {
2391
+ process.stdout.write(pc3.dim(` [could not fetch image: ${url}]
2392
+ `));
2393
+ }
2394
+ }
2395
+ if (imageBlocks.length > 0) {
2396
+ const blocks = [
2397
+ { type: "text", text: textContent },
2398
+ ...imageBlocks
2399
+ ];
2400
+ messages.push({ role: "user", content: blocks });
2401
+ } else {
2402
+ messages.push({ role: "user", content: textContent });
2403
+ }
2184
2404
  let augmentedSystemPrompt = activeSystemPrompt;
2185
2405
  let memoryTokens = 0;
2186
2406
  if (mcpManager) {