open-agents-ai 0.187.164 → 0.187.165

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +67 -1
  2. package/package.json +1 -1
package/dist/index.js CHANGED
@@ -257987,8 +257987,33 @@ var init_multimodal_memory = __esm({
257987
257987
  } catch {
257988
257988
  }
257989
257989
  if (existsSync36(imagePath)) {
257990
- episode.visual = { faceIds: [], faceNames: [], objects: [], imagePath };
257990
+ episode.visual = { faceIds: [], faceNames: [], objects: [], imagePath, clipEmbedding: null };
257991
257991
  results.push("Photo captured");
257992
// Best-effort CLIP image embedding for the captured photo. Runs only when the
// optional vision-ml virtualenv interpreter exists; any failure leaves
// episode.visual.clipEmbedding at its initial null value.
try {
  const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
  if (existsSync36(venvPy)) {
    // A JSON string literal is also a valid Python string literal, so quotes
    // and backslashes in the path cannot break out of (or corrupt) the script.
    const imagePathLiteral = JSON.stringify(imagePath);
    const clipScript = `
import json, torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
img = Image.open(${imagePathLiteral}).convert("RGB")
inputs = processor(images=img, return_tensors="pt")
with torch.no_grad():
    features = model.get_image_features(**inputs)
features = features / features.norm(dim=-1, keepdim=True)
print(json.dumps(features[0].cpu().numpy().tolist()))
`;
    // The script is fed to the interpreter on stdin ("python3 -") so no temp
    // file is left behind, and the interpreter path is quoted so a home
    // directory containing spaces still works.
    const clipOutput = execSync38(`"${venvPy}" -`, {
      input: clipScript,
      encoding: "utf8",
      timeout: 12e4,
      env: { ...process.env, PYTHONUNBUFFERED: "1" }
    });
    // Only the last stdout line is parsed; earlier lines may be library chatter.
    const embedding = JSON.parse(clipOutput.trim().split("\n").pop());
    if (Array.isArray(embedding)) {
      episode.visual.clipEmbedding = embedding;
      results.push(`CLIP embedding computed (${embedding.length}d)`);
    }
  }
} catch {
  // Deliberately ignored: the CLIP embedding is optional and the capture
  // must still succeed without it.
}
257992
258017
  try {
257993
258018
  const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
257994
258019
  if (existsSync36(venvPy)) {
@@ -258188,6 +258213,28 @@ Recall later: multimodal_memory action=recall query="${personName}"`,
258188
258213
  return { success: true, output: "No multi-modal episodes recorded yet. Use 'capture' or 'meet' first.", durationMs: performance.now() - start2 };
258189
258214
  }
258190
258215
  const queryLower = query.toLowerCase();
258216
// Best-effort CLIP text embedding of the recall query. Stays null when the
// optional vision-ml virtualenv is missing or the subprocess fails, in which
// case the CLIP similarity term is simply skipped during scoring.
let queryClipEmbedding = null;
try {
  const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
  if (existsSync36(venvPy)) {
    // The query is untrusted text. Escaping only double quotes still lets a
    // backslash-quote sequence terminate the Python string, so the query is
    // embedded as a JSON string literal instead (valid Python string syntax,
    // with quotes, backslashes and control characters all escaped).
    // Newlines are flattened to spaces first, as before.
    const queryLiteral = JSON.stringify(query.replace(/\n/g, " "));
    const clipTextScript = `
import json, torch
from transformers import CLIPProcessor, CLIPModel
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = processor(text=[${queryLiteral}], return_tensors="pt", padding=True)
with torch.no_grad():
    features = model.get_text_features(**inputs)
features = features / features.norm(dim=-1, keepdim=True)
print(json.dumps(features[0].cpu().numpy().tolist()))
`;
    // The script is fed to the interpreter on stdin ("python3 -") so no temp
    // file is left behind, and the interpreter path is quoted so a home
    // directory containing spaces still works.
    const output = execSync38(`"${venvPy}" -`, {
      input: clipTextScript,
      encoding: "utf8",
      timeout: 6e4,
      env: { ...process.env, PYTHONUNBUFFERED: "1" }
    });
    // Only the last stdout line is parsed; earlier lines may be library chatter.
    const parsedEmbedding = JSON.parse(output.trim().split("\n").pop());
    if (Array.isArray(parsedEmbedding)) {
      queryClipEmbedding = parsedEmbedding;
    }
  }
} catch {
  // Deliberately ignored: recall falls back to the non-CLIP scoring signals.
}
258191
258238
  const scored = episodes.map((ep) => {
258192
258239
  let score = 0;
258193
258240
  const matchedModalities = [];
@@ -258219,6 +258266,13 @@ Recall later: multimodal_memory action=recall query="${personName}"`,
258219
258266
  score += 4;
258220
258267
  matchedModalities.push(`location: ${ep.spatial.locationLabel}`);
258221
258268
  }
258269
+ if (queryClipEmbedding && ep.visual?.clipEmbedding) {
258270
+ const clipSim = this.cosineSim(queryClipEmbedding, ep.visual.clipEmbedding);
258271
+ if (clipSim > 0.2) {
258272
+ score += clipSim * 8;
258273
+ matchedModalities.push(`CLIP visual: ${(clipSim * 100).toFixed(0)}%`);
258274
+ }
258275
+ }
258222
258276
  const hoursSince = (Date.now() - ep.timestamp) / 36e5;
258223
258277
  const recency = Math.pow(0.995, hoursSince);
258224
258278
  score *= 0.5 + 0.5 * recency;
@@ -258311,6 +258365,18 @@ ${lines.join("\n")}`,
258311
258365
  return null;
258312
258366
  }
258313
258367
  }
258368
+ cosineSim(a2, b) {
258369
+ if (a2.length !== b.length || a2.length === 0)
258370
+ return 0;
258371
+ let dot = 0, normA = 0, normB = 0;
258372
+ for (let i2 = 0; i2 < a2.length; i2++) {
258373
+ dot += a2[i2] * b[i2];
258374
+ normA += a2[i2] * a2[i2];
258375
+ normB += b[i2] * b[i2];
258376
+ }
258377
+ const denom = Math.sqrt(normA) * Math.sqrt(normB);
258378
+ return denom > 0 ? dot / denom : 0;
258379
+ }
258314
258380
  loadAllEpisodes() {
258315
258381
  const episodes = [];
258316
258382
  try {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "open-agents-ai",
3
- "version": "0.187.164",
3
+ "version": "0.187.165",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",