npm - open-agents-ai - Versions diffs - 0.187.164 → 0.187.165 - Mend

open-agents-ai 0.187.164 → 0.187.165

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/dist/index.js +67 -1
package/package.json +1 -1

package/dist/index.js CHANGED

@@ -257987,8 +257987,33 @@ var init_multimodal_memory = __esm({
           } catch {
           }
           if (existsSync36(imagePath)) {
-            episode.visual = { faceIds: [], faceNames: [], objects: [], imagePath };
+            episode.visual = { faceIds: [], faceNames: [], objects: [], imagePath, clipEmbedding: null };
             results.push("Photo captured");
+            try {
+              const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
+              if (existsSync36(venvPy)) {
+                const clipScript = `
+import json, torch
+from PIL import Image
+from transformers import CLIPProcessor, CLIPModel
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+img = Image.open("${imagePath}").convert("RGB")
+inputs = processor(images=img, return_tensors="pt")
+with torch.no_grad():
+    features = model.get_image_features(**inputs)
+    features = features / features.norm(dim=-1, keepdim=True)
+print(json.dumps(features[0].cpu().numpy().tolist()))
+`;
+                const scriptFile = join50(tmpdir14(), `mm-clip-${Date.now()}.py`);
+                writeFileSync15(scriptFile, clipScript);
+                const clipOutput = execSync38(`${venvPy} ${scriptFile}`, { encoding: "utf8", timeout: 12e4, env: { ...process.env, PYTHONUNBUFFERED: "1" } });
+                const embedding = JSON.parse(clipOutput.trim().split("\n").pop());
+                episode.visual.clipEmbedding = embedding;
+                results.push(`CLIP embedding computed (${embedding.length}d)`);
+              }
+            } catch {
+            }
             try {
               const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
               if (existsSync36(venvPy)) {
@@ -258188,6 +258213,28 @@ Recall later: multimodal_memory action=recall query="${personName}"`,
           return { success: true, output: "No multi-modal episodes recorded yet. Use 'capture' or 'meet' first.", durationMs: performance.now() - start2 };
         }
         const queryLower = query.toLowerCase();
+        let queryClipEmbedding = null;
+        try {
+          const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
+          if (existsSync36(venvPy)) {
+            const clipTextScript = `
+import json, torch
+from transformers import CLIPProcessor, CLIPModel
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+inputs = processor(text=["${query.replace(/"/g, '\\"').replace(/\n/g, " ")}"], return_tensors="pt", padding=True)
+with torch.no_grad():
+    features = model.get_text_features(**inputs)
+    features = features / features.norm(dim=-1, keepdim=True)
+print(json.dumps(features[0].cpu().numpy().tolist()))
+`;
+            const scriptFile = join50(tmpdir14(), `mm-clipq-${Date.now()}.py`);
+            writeFileSync15(scriptFile, clipTextScript);
+            const output = execSync38(`${venvPy} ${scriptFile}`, { encoding: "utf8", timeout: 6e4, env: { ...process.env, PYTHONUNBUFFERED: "1" } });
+            queryClipEmbedding = JSON.parse(output.trim().split("\n").pop());
+          }
+        } catch {
+        }
         const scored = episodes.map((ep) => {
           let score = 0;
           const matchedModalities = [];
@@ -258219,6 +258266,13 @@ Recall later: multimodal_memory action=recall query="${personName}"`,
             score += 4;
             matchedModalities.push(`location: ${ep.spatial.locationLabel}`);
           }
+          if (queryClipEmbedding && ep.visual?.clipEmbedding) {
+            const clipSim = this.cosineSim(queryClipEmbedding, ep.visual.clipEmbedding);
+            if (clipSim > 0.2) {
+              score += clipSim * 8;
+              matchedModalities.push(`CLIP visual: ${(clipSim * 100).toFixed(0)}%`);
+            }
+          }
           const hoursSince = (Date.now() - ep.timestamp) / 36e5;
           const recency = Math.pow(0.995, hoursSince);
           score *= 0.5 + 0.5 * recency;
@@ -258311,6 +258365,18 @@ ${lines.join("\n")}`,
           return null;
         }
       }
+      cosineSim(a2, b) {
+        if (a2.length !== b.length || a2.length === 0)
+          return 0;
+        let dot = 0, normA = 0, normB = 0;
+        for (let i2 = 0; i2 < a2.length; i2++) {
+          dot += a2[i2] * b[i2];
+          normA += a2[i2] * a2[i2];
+          normB += b[i2] * b[i2];
+        }
+        const denom = Math.sqrt(normA) * Math.sqrt(normB);
+        return denom > 0 ? dot / denom : 0;
+      }
       loadAllEpisodes() {
         const episodes = [];
         try {

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "open-agents-ai",
-  "version": "0.187.164",
+  "version": "0.187.165",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",