open-agents-ai 0.187.164 → 0.187.165
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +67 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -257987,8 +257987,33 @@ var init_multimodal_memory = __esm({
|
|
|
257987
257987
|
} catch {
|
|
257988
257988
|
}
|
|
257989
257989
|
if (existsSync36(imagePath)) {
|
|
257990
|
-
episode.visual = { faceIds: [], faceNames: [], objects: [], imagePath };
|
|
257990
|
+
episode.visual = { faceIds: [], faceNames: [], objects: [], imagePath, clipEmbedding: null };
|
|
257991
257991
|
results.push("Photo captured");
|
|
257992
|
+
try {
|
|
257993
|
+
const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
|
|
257994
|
+
if (existsSync36(venvPy)) {
|
|
257995
|
+
const clipScript = `
|
|
257996
|
+
import json, torch
|
|
257997
|
+
from PIL import Image
|
|
257998
|
+
from transformers import CLIPProcessor, CLIPModel
|
|
257999
|
+
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
|
258000
|
+
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
258001
|
+
img = Image.open("${imagePath}").convert("RGB")
|
|
258002
|
+
inputs = processor(images=img, return_tensors="pt")
|
|
258003
|
+
with torch.no_grad():
|
|
258004
|
+
features = model.get_image_features(**inputs)
|
|
258005
|
+
features = features / features.norm(dim=-1, keepdim=True)
|
|
258006
|
+
print(json.dumps(features[0].cpu().numpy().tolist()))
|
|
258007
|
+
`;
|
|
258008
|
+
const scriptFile = join50(tmpdir14(), `mm-clip-${Date.now()}.py`);
|
|
258009
|
+
writeFileSync15(scriptFile, clipScript);
|
|
258010
|
+
const clipOutput = execSync38(`${venvPy} ${scriptFile}`, { encoding: "utf8", timeout: 12e4, env: { ...process.env, PYTHONUNBUFFERED: "1" } });
|
|
258011
|
+
const embedding = JSON.parse(clipOutput.trim().split("\n").pop());
|
|
258012
|
+
episode.visual.clipEmbedding = embedding;
|
|
258013
|
+
results.push(`CLIP embedding computed (${embedding.length}d)`);
|
|
258014
|
+
}
|
|
258015
|
+
} catch {
|
|
258016
|
+
}
|
|
257992
258017
|
try {
|
|
257993
258018
|
const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
|
|
257994
258019
|
if (existsSync36(venvPy)) {
|
|
@@ -258188,6 +258213,28 @@ Recall later: multimodal_memory action=recall query="${personName}"`,
|
|
|
258188
258213
|
return { success: true, output: "No multi-modal episodes recorded yet. Use 'capture' or 'meet' first.", durationMs: performance.now() - start2 };
|
|
258189
258214
|
}
|
|
258190
258215
|
const queryLower = query.toLowerCase();
|
|
258216
|
+
let queryClipEmbedding = null;
|
|
258217
|
+
try {
|
|
258218
|
+
const venvPy = join50(homedir15(), ".open-agents", "vision-ml-venv", "bin", "python3");
|
|
258219
|
+
if (existsSync36(venvPy)) {
|
|
258220
|
+
const clipTextScript = `
|
|
258221
|
+
import json, torch
|
|
258222
|
+
from transformers import CLIPProcessor, CLIPModel
|
|
258223
|
+
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
|
258224
|
+
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
|
258225
|
+
inputs = processor(text=["${query.replace(/"/g, '\\"').replace(/\n/g, " ")}"], return_tensors="pt", padding=True)
|
|
258226
|
+
with torch.no_grad():
|
|
258227
|
+
features = model.get_text_features(**inputs)
|
|
258228
|
+
features = features / features.norm(dim=-1, keepdim=True)
|
|
258229
|
+
print(json.dumps(features[0].cpu().numpy().tolist()))
|
|
258230
|
+
`;
|
|
258231
|
+
const scriptFile = join50(tmpdir14(), `mm-clipq-${Date.now()}.py`);
|
|
258232
|
+
writeFileSync15(scriptFile, clipTextScript);
|
|
258233
|
+
const output = execSync38(`${venvPy} ${scriptFile}`, { encoding: "utf8", timeout: 6e4, env: { ...process.env, PYTHONUNBUFFERED: "1" } });
|
|
258234
|
+
queryClipEmbedding = JSON.parse(output.trim().split("\n").pop());
|
|
258235
|
+
}
|
|
258236
|
+
} catch {
|
|
258237
|
+
}
|
|
258191
258238
|
const scored = episodes.map((ep) => {
|
|
258192
258239
|
let score = 0;
|
|
258193
258240
|
const matchedModalities = [];
|
|
@@ -258219,6 +258266,13 @@ Recall later: multimodal_memory action=recall query="${personName}"`,
|
|
|
258219
258266
|
score += 4;
|
|
258220
258267
|
matchedModalities.push(`location: ${ep.spatial.locationLabel}`);
|
|
258221
258268
|
}
|
|
258269
|
+
if (queryClipEmbedding && ep.visual?.clipEmbedding) {
|
|
258270
|
+
const clipSim = this.cosineSim(queryClipEmbedding, ep.visual.clipEmbedding);
|
|
258271
|
+
if (clipSim > 0.2) {
|
|
258272
|
+
score += clipSim * 8;
|
|
258273
|
+
matchedModalities.push(`CLIP visual: ${(clipSim * 100).toFixed(0)}%`);
|
|
258274
|
+
}
|
|
258275
|
+
}
|
|
258222
258276
|
const hoursSince = (Date.now() - ep.timestamp) / 36e5;
|
|
258223
258277
|
const recency = Math.pow(0.995, hoursSince);
|
|
258224
258278
|
score *= 0.5 + 0.5 * recency;
|
|
@@ -258311,6 +258365,18 @@ ${lines.join("\n")}`,
|
|
|
258311
258365
|
return null;
|
|
258312
258366
|
}
|
|
258313
258367
|
}
|
|
258368
|
+
cosineSim(a2, b) {
|
|
258369
|
+
if (a2.length !== b.length || a2.length === 0)
|
|
258370
|
+
return 0;
|
|
258371
|
+
let dot = 0, normA = 0, normB = 0;
|
|
258372
|
+
for (let i2 = 0; i2 < a2.length; i2++) {
|
|
258373
|
+
dot += a2[i2] * b[i2];
|
|
258374
|
+
normA += a2[i2] * a2[i2];
|
|
258375
|
+
normB += b[i2] * b[i2];
|
|
258376
|
+
}
|
|
258377
|
+
const denom = Math.sqrt(normA) * Math.sqrt(normB);
|
|
258378
|
+
return denom > 0 ? dot / denom : 0;
|
|
258379
|
+
}
|
|
258314
258380
|
loadAllEpisodes() {
|
|
258315
258381
|
const episodes = [];
|
|
258316
258382
|
try {
|
package/package.json
CHANGED