npm - open-agents-ai - Versions diffs - 0.187.593 → 0.187.594 - Mend

open-agents-ai 0.187.593 → 0.187.594

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js CHANGED Viewed

@@ -503992,6 +503992,13 @@ ${cameras.join("\n")}`,
         }
         const data = readFileSync29(filePath);
         const sizeKB = Math.round(data.length / 1024);
+        const outputPath = userOutputPath ?? join55(process.cwd(), ".oa", "camera-captures", `capture-${Date.now()}.jpg`);
+        if (!userOutputPath) {
+          mkdirSync14(join55(process.cwd(), ".oa", "camera-captures"), {
+            recursive: true
+          });
+          writeFileSync15(outputPath, data);
+        }
         if (userOutputPath) {
           return {
             success: true,
@@ -504005,12 +504012,13 @@ Saved to: ${userOutputPath}`,
         } catch {
         }
         const base642 = data.toString("base64");
+        const display = `Captured ${resolution} frame from ${source} (${sizeKB}KB JPEG).
+Saved to: ${outputPath}`;
         return {
           success: true,
-          output: `Captured ${resolution} frame from ${source} (${sizeKB}KB JPEG).
-Base64 image data (use with vision tools):
-data:image/jpeg;base64,${base642}`,
+          output: display,
+          llmContent: `${display}
+[IMAGE_BASE64:image/jpeg:${base642}]`,
           durationMs: performance.now() - start2
         };
       }
@@ -531235,19 +531243,22 @@ TASK: ${task}` : task;
           web_fetch: 4,
           list_directory: 12,
           find_files: 10,
-          grep_search: 12
+          grep_search: 12,
+          camera_capture: 3
         } : loopTier === "medium" ? {
           web_search: 10,
           web_fetch: 8,
           list_directory: 18,
           find_files: 14,
-          grep_search: 18
+          grep_search: 18,
+          camera_capture: 4
         } : {
           web_search: 20,
           web_fetch: 15,
           list_directory: 30,
           find_files: 20,
-          grep_search: 30
+          grep_search: 30,
+          camera_capture: 5
         };
         for (const [tool, budget] of Object.entries(toolBudgets)) {
           toolCallBudget.set(tool, budget);
@@ -534464,6 +534475,9 @@ Respond with EXACTLY this structure before your next tool call:
                   };
                 }
               }
+              if (result.success) {
+                result = await this.offloadEmbeddedImageResult(result, tc.name, turn);
+              }
               let output = this.normalizeToolOutput(result, tc.name, tc.arguments, turn);
               if (!result.success && (this.options.modelTier === "small" || this.options.modelTier === "medium")) {
                 const recovery = this.buildRecoveryGuidance(tc.name, result.error ?? "", tc.arguments);
@@ -536881,25 +536895,14 @@ Integrate this guidance into your current approach. Continue working on the task
           turn,
           timestamp: (/* @__PURE__ */ new Date()).toISOString()
         });
-        const tmpImgPath = this.writeTempImageForOcr(mime, base642);
-        const [visionOutcome, ocrOutcome] = await Promise.allSettled([
-          this.describeImageViaVisionSubagent(imageUrl, textContent),
-          tmpImgPath ? this.extractImageOcrText(tmpImgPath) : Promise.resolve("")
-        ]);
-        const visionDesc = visionOutcome.status === "fulfilled" ? visionOutcome.value.trim() : "";
-        const ocrText = ocrOutcome.status === "fulfilled" ? ocrOutcome.value.trim() : "";
-        if (visionDesc || ocrText) {
-          const sections = [];
-          if (visionDesc)
-            sections.push(`[Image analysis]: ${visionDesc}`);
-          if (ocrText)
-            sections.push(`[OCR extracted text]: ${ocrText}`);
+        const analysis = await this.analyzeImageDataForContext(mime, base642, textContent);
+        if (analysis.contextBlock) {
           const userPrefix = textContent ? `[User added context]: ${textContent}
 ` : "[User shared an image]. ";
           messages2.push({
             role: "user",
-            content: userPrefix + sections.join("\n\n") + "\n\nIntegrate this visual information into your current approach."
+            content: userPrefix + analysis.contextBlock + "\n\nIntegrate this visual information into your current approach."
           });
           this.emit({
             type: "status",
@@ -536909,7 +536912,7 @@ Integrate this guidance into your current approach. Continue working on the task
           });
           return;
         }
-        const reason = visionOutcome.status === "rejected" ? String(visionOutcome.reason?.message ?? visionOutcome.reason) : "vision and OCR returned no text";
+        const reason = analysis.errorReason || "vision and OCR returned no text";
         this.emit({
           type: "status",
           content: `Image offload unavailable (${reason}); falling back to inline image`,
@@ -536918,6 +536921,71 @@ Integrate this guidance into your current approach. Continue working on the task
         });
         this.appendInlineImageMessage(messages2, imageUrl, textContent);
       }
+      async offloadEmbeddedImageResult(result, toolName, turn) {
+        const modelSource = result.llmContent ?? result.output;
+        const image = this.extractFirstEmbeddedImage(modelSource);
+        if (!image)
+          return result;
+        this.emit({
+          type: "status",
+          content: `${toolName}: offloading embedded image analysis outside main context`,
+          turn,
+          timestamp: (/* @__PURE__ */ new Date()).toISOString()
+        });
+        const analysis = await this.analyzeImageDataForContext(image.mime, image.base64, image.textWithoutImage.slice(0, 2e3));
+        const imageNote = analysis.contextBlock ? `${analysis.contextBlock}
+Use this image analysis. Do not repeat ${toolName} with the same arguments unless the scene has changed.` : `[Embedded image data omitted from model context; ${analysis.errorReason || "vision and OCR returned no text"}. Use any saved image path above with vision/image_read if further inspection is needed.]`;
+        return {
+          ...result,
+          llmContent: `${image.textWithoutImage.trim()}
+${imageNote}`.trim()
+        };
+      }
+      extractFirstEmbeddedImage(text) {
+        const markerPattern = /\[IMAGE_BASE64:([^:\]]+):([^\]]+)\]/;
+        const markerMatch = text.match(markerPattern);
+        if (markerMatch) {
+          const mime2 = markerMatch[1];
+          const base643 = markerMatch[2];
+          return {
+            mime: mime2,
+            base64: base643,
+            textWithoutImage: text.replace(markerPattern, `[image data omitted: ${mime2}, ${base643.length} base64 chars]`).trim()
+          };
+        }
+        const dataUrlPattern = /data:(image\/[a-zA-Z0-9.+-]+);base64,([A-Za-z0-9+/=]+)/;
+        const dataUrlMatch = text.match(dataUrlPattern);
+        if (!dataUrlMatch)
+          return null;
+        const mime = dataUrlMatch[1];
+        const base642 = dataUrlMatch[2];
+        return {
+          mime,
+          base64: base642,
+          textWithoutImage: text.replace(dataUrlPattern, `[image data omitted: ${mime}, ${base642.length} base64 chars]`).trim()
+        };
+      }
+      async analyzeImageDataForContext(mime, base642, textContent) {
+        const imageUrl = `data:${mime};base64,${base642}`;
+        const tmpImgPath = this.writeTempImageForOcr(mime, base642);
+        const [visionOutcome, ocrOutcome] = await Promise.allSettled([
+          this.describeImageViaVisionSubagent(imageUrl, textContent),
+          tmpImgPath ? this.extractImageOcrText(tmpImgPath) : Promise.resolve("")
+        ]);
+        const visionDesc = visionOutcome.status === "fulfilled" ? visionOutcome.value.trim() : "";
+        const ocrText = ocrOutcome.status === "fulfilled" ? ocrOutcome.value.trim() : "";
+        const sections = [];
+        if (visionDesc)
+          sections.push(`[Image analysis]: ${visionDesc}`);
+        if (ocrText)
+          sections.push(`[OCR extracted text]: ${ocrText}`);
+        if (sections.length > 0)
+          return { contextBlock: sections.join("\n\n") };
+        const errorReason = visionOutcome.status === "rejected" ? String(visionOutcome.reason?.message ?? visionOutcome.reason) : void 0;
+        return { contextBlock: "", errorReason };
+      }
       async describeImageViaVisionSubagent(imageUrl, textContent) {
         const visionMessages = [
           {
@@ -591936,6 +592004,36 @@ function normalizeTelegramMedia(message2) {
   }
   return void 0;
 }
+function telegramMediaIsImage(media) {
+  if (media.type === "photo") return true;
+  if (media.mimeType?.toLowerCase().startsWith("image/")) return true;
+  return /\.(png|jpe?g|gif|webp|bmp|tiff?)$/i.test(media.fileName ?? "");
+}
+function telegramImageExtension(media) {
+  const fileName = media.fileName ?? "";
+  const dotIdx = fileName.lastIndexOf(".");
+  if (dotIdx >= 0) {
+    const ext = fileName.slice(dotIdx).toLowerCase();
+    if (/^\.(png|jpe?g|gif|webp|bmp|tiff?)$/.test(ext)) return ext;
+  }
+  const mime = media.mimeType?.toLowerCase() ?? "";
+  if (mime.includes("png")) return ".png";
+  if (mime.includes("webp")) return ".webp";
+  if (mime.includes("gif")) return ".gif";
+  if (mime.includes("bmp")) return ".bmp";
+  if (mime.includes("tiff")) return ".tif";
+  return ".jpg";
+}
+function telegramImageMime(media) {
+  if (media.mimeType?.toLowerCase().startsWith("image/")) return media.mimeType;
+  const ext = telegramImageExtension(media);
+  if (ext === ".png") return "image/png";
+  if (ext === ".webp") return "image/webp";
+  if (ext === ".gif") return "image/gif";
+  if (ext === ".bmp") return "image/bmp";
+  if (ext === ".tif" || ext === ".tiff") return "image/tiff";
+  return "image/jpeg";
+}
 function normalizeTelegramUpdate(update2) {
   const sourceUpdateType = update2.guest_message ? "guest_message" : update2.message ? "message" : null;
   if (!sourceUpdateType) return null;
@@ -592794,12 +592892,22 @@ Join: ${newUrl}`);
         }
         const existing = this.subAgents.get(sessionKey);
         if (existing && !existing.aborted) {
-          this.recordChatHistory(sessionKey, { role: "user", text: msg.text, mode: "steering" });
+          let steeringText = msg.text;
+          if (msg.media) {
+            const mediaContext = await this.processMedia(msg);
+            if (mediaContext) {
+              steeringText += `
+[Media attached — processed content below]
+${mediaContext}`;
+            }
+          }
+          this.recordChatHistory(sessionKey, { role: "user", text: steeringText, mode: "steering" });
           if (existing.runner) {
-            existing.runner.injectUserMessage(msg.text);
+            existing.runner.injectUserMessage(steeringText);
             this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, "mid-conversation steering injected"));
           } else {
-            existing.pendingMessages.push(msg.text);
+            existing.pendingMessages.push(steeringText);
             this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, `queued (${existing.pendingMessages.length} pending)`));
           }
           return;
@@ -593494,8 +593602,9 @@ Todo/session id: ${sessionContext.sessionId}` : `Telegram ${isGroup ? "group" :
       async processMedia(msg) {
         if (!msg.media) return "";
         const { type, fileId, fileUniqueId, mimeType, caption } = msg.media;
+        const isImageMedia = telegramMediaIsImage(msg.media);
         let ext = ".bin";
-        if (type === "photo") ext = ".jpg";
+        if (isImageMedia) ext = telegramImageExtension(msg.media);
         else if (type === "audio" || type === "voice") ext = ".ogg";
         else if (type === "video" || type === "video_note" || type === "live_photo") ext = ".mp4";
         else if (msg.media.fileName) {
@@ -593526,23 +593635,27 @@ Todo/session id: ${sessionContext.sessionId}` : `Telegram ${isGroup ? "group" :
           username: msg.username
         });
         let description = `[${type}${caption ? `: ${caption}` : ""}]`;
-        if (type === "photo") {
+        if (isImageMedia) {
           let visionContext = "";
           try {
             const { runVisionIngress: runVisionIngress2, formatImageContextPrefix: formatImageContextPrefix2 } = await Promise.resolve().then(() => (init_vision_ingress(), vision_ingress_exports));
             const ingressResult = await runVisionIngress2(
-              { path: localPath, buffer: Buffer.from(""), mime: "image/png" },
-              ""
+              {
+                path: localPath,
+                buffer: readFileSync84(localPath),
+                mime: telegramImageMime(msg.media)
+              },
+              this.agentConfig?.model ?? ""
             );
             visionContext = formatImageContextPrefix2(ingressResult);
             cacheEntry.extractedContent = ingressResult.contextBlock;
           } catch {
           }
           if (visionContext) {
-            description = `[Photo received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
+            description = `[Image received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
 ${visionContext}]`;
           } else {
-            description = `[Photo received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read or vision tools to analyze it if available.]`;
+            description = `[Image received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read or vision tools to analyze it if available.]`;
           }
           try {
             await fetch("http://127.0.0.1:11435/v1/memory/ingest", {
@@ -593553,8 +593666,7 @@ ${visionContext}]`;
             });
           } catch {
           }
-        }
-        if (type === "audio" || type === "voice") {
+        } else if (type === "audio" || type === "voice") {
           let transcription = null;
           try {
             const { getListenEngine: getListenEngine2 } = await Promise.resolve().then(() => (init_listen(), listen_exports));
@@ -593580,12 +593692,10 @@ ${visionContext}]`;
             });
           } catch {
           }
-        }
-        if (type === "video" || type === "video_note" || type === "live_photo") {
+        } else if (type === "video" || type === "video_note" || type === "live_photo") {
           const label = type === "live_photo" ? "Live photo" : "Video";
           description = `[${label} received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
-        }
-        if (type === "document") {
+        } else if (type === "document") {
           description = `[Document received: ${msg.media.fileName || "unnamed"}${mimeType ? ` (${mimeType})` : ""}, saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
         }
         cacheEntry.extractedContent = description;

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "open-agents-ai",
-  "version": "0.187.593",
+  "version": "0.187.594",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "open-agents-ai",
-      "version": "0.187.593",
+      "version": "0.187.594",
       "hasInstallScript": true,
       "license": "CC-BY-NC-4.0",
       "dependencies": {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "open-agents-ai",
-  "version": "0.187.593",
+  "version": "0.187.594",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",