open-agents-ai 0.187.593 → 0.187.594

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -503992,6 +503992,13 @@ ${cameras.join("\n")}`,
503992
503992
  }
503993
503993
  const data = readFileSync29(filePath);
503994
503994
  const sizeKB = Math.round(data.length / 1024);
503995
+ const outputPath = userOutputPath ?? join55(process.cwd(), ".oa", "camera-captures", `capture-${Date.now()}.jpg`);
503996
+ if (!userOutputPath) {
503997
+ mkdirSync14(join55(process.cwd(), ".oa", "camera-captures"), {
503998
+ recursive: true
503999
+ });
504000
+ writeFileSync15(outputPath, data);
504001
+ }
503995
504002
  if (userOutputPath) {
503996
504003
  return {
503997
504004
  success: true,
@@ -504005,12 +504012,13 @@ Saved to: ${userOutputPath}`,
504005
504012
  } catch {
504006
504013
  }
504007
504014
  const base642 = data.toString("base64");
504015
+ const display = `Captured ${resolution} frame from ${source} (${sizeKB}KB JPEG).
504016
+ Saved to: ${outputPath}`;
504008
504017
  return {
504009
504018
  success: true,
504010
- output: `Captured ${resolution} frame from ${source} (${sizeKB}KB JPEG).
504011
-
504012
- Base64 image data (use with vision tools):
504013
- data:image/jpeg;base64,${base642}`,
504019
+ output: display,
504020
+ llmContent: `${display}
504021
+ [IMAGE_BASE64:image/jpeg:${base642}]`,
504014
504022
  durationMs: performance.now() - start2
504015
504023
  };
504016
504024
  }
@@ -531235,19 +531243,22 @@ TASK: ${task}` : task;
531235
531243
  web_fetch: 4,
531236
531244
  list_directory: 12,
531237
531245
  find_files: 10,
531238
- grep_search: 12
531246
+ grep_search: 12,
531247
+ camera_capture: 3
531239
531248
  } : loopTier === "medium" ? {
531240
531249
  web_search: 10,
531241
531250
  web_fetch: 8,
531242
531251
  list_directory: 18,
531243
531252
  find_files: 14,
531244
- grep_search: 18
531253
+ grep_search: 18,
531254
+ camera_capture: 4
531245
531255
  } : {
531246
531256
  web_search: 20,
531247
531257
  web_fetch: 15,
531248
531258
  list_directory: 30,
531249
531259
  find_files: 20,
531250
- grep_search: 30
531260
+ grep_search: 30,
531261
+ camera_capture: 5
531251
531262
  };
531252
531263
  for (const [tool, budget] of Object.entries(toolBudgets)) {
531253
531264
  toolCallBudget.set(tool, budget);
@@ -534464,6 +534475,9 @@ Respond with EXACTLY this structure before your next tool call:
534464
534475
  };
534465
534476
  }
534466
534477
  }
534478
+ if (result.success) {
534479
+ result = await this.offloadEmbeddedImageResult(result, tc.name, turn);
534480
+ }
534467
534481
  let output = this.normalizeToolOutput(result, tc.name, tc.arguments, turn);
534468
534482
  if (!result.success && (this.options.modelTier === "small" || this.options.modelTier === "medium")) {
534469
534483
  const recovery = this.buildRecoveryGuidance(tc.name, result.error ?? "", tc.arguments);
@@ -536881,25 +536895,14 @@ Integrate this guidance into your current approach. Continue working on the task
536881
536895
  turn,
536882
536896
  timestamp: (/* @__PURE__ */ new Date()).toISOString()
536883
536897
  });
536884
- const tmpImgPath = this.writeTempImageForOcr(mime, base642);
536885
- const [visionOutcome, ocrOutcome] = await Promise.allSettled([
536886
- this.describeImageViaVisionSubagent(imageUrl, textContent),
536887
- tmpImgPath ? this.extractImageOcrText(tmpImgPath) : Promise.resolve("")
536888
- ]);
536889
- const visionDesc = visionOutcome.status === "fulfilled" ? visionOutcome.value.trim() : "";
536890
- const ocrText = ocrOutcome.status === "fulfilled" ? ocrOutcome.value.trim() : "";
536891
- if (visionDesc || ocrText) {
536892
- const sections = [];
536893
- if (visionDesc)
536894
- sections.push(`[Image analysis]: ${visionDesc}`);
536895
- if (ocrText)
536896
- sections.push(`[OCR extracted text]: ${ocrText}`);
536898
+ const analysis = await this.analyzeImageDataForContext(mime, base642, textContent);
536899
+ if (analysis.contextBlock) {
536897
536900
  const userPrefix = textContent ? `[User added context]: ${textContent}
536898
536901
 
536899
536902
  ` : "[User shared an image]. ";
536900
536903
  messages2.push({
536901
536904
  role: "user",
536902
- content: userPrefix + sections.join("\n\n") + "\n\nIntegrate this visual information into your current approach."
536905
+ content: userPrefix + analysis.contextBlock + "\n\nIntegrate this visual information into your current approach."
536903
536906
  });
536904
536907
  this.emit({
536905
536908
  type: "status",
@@ -536909,7 +536912,7 @@ Integrate this guidance into your current approach. Continue working on the task
536909
536912
  });
536910
536913
  return;
536911
536914
  }
536912
- const reason = visionOutcome.status === "rejected" ? String(visionOutcome.reason?.message ?? visionOutcome.reason) : "vision and OCR returned no text";
536915
+ const reason = analysis.errorReason || "vision and OCR returned no text";
536913
536916
  this.emit({
536914
536917
  type: "status",
536915
536918
  content: `Image offload unavailable (${reason}); falling back to inline image`,
@@ -536918,6 +536921,71 @@ Integrate this guidance into your current approach. Continue working on the task
536918
536921
  });
536919
536922
  this.appendInlineImageMessage(messages2, imageUrl, textContent);
536920
536923
  }
536924
+ async offloadEmbeddedImageResult(result, toolName, turn) {
536925
+ const modelSource = result.llmContent ?? result.output;
536926
+ const image = this.extractFirstEmbeddedImage(modelSource);
536927
+ if (!image)
536928
+ return result;
536929
+ this.emit({
536930
+ type: "status",
536931
+ content: `${toolName}: offloading embedded image analysis outside main context`,
536932
+ turn,
536933
+ timestamp: (/* @__PURE__ */ new Date()).toISOString()
536934
+ });
536935
+ const analysis = await this.analyzeImageDataForContext(image.mime, image.base64, image.textWithoutImage.slice(0, 2e3));
536936
+ const imageNote = analysis.contextBlock ? `${analysis.contextBlock}
536937
+
536938
+ Use this image analysis. Do not repeat ${toolName} with the same arguments unless the scene has changed.` : `[Embedded image data omitted from model context; ${analysis.errorReason || "vision and OCR returned no text"}. Use any saved image path above with vision/image_read if further inspection is needed.]`;
536939
+ return {
536940
+ ...result,
536941
+ llmContent: `${image.textWithoutImage.trim()}
536942
+
536943
+ ${imageNote}`.trim()
536944
+ };
536945
+ }
536946
+ extractFirstEmbeddedImage(text) {
536947
+ const markerPattern = /\[IMAGE_BASE64:([^:\]]+):([^\]]+)\]/;
536948
+ const markerMatch = text.match(markerPattern);
536949
+ if (markerMatch) {
536950
+ const mime2 = markerMatch[1];
536951
+ const base643 = markerMatch[2];
536952
+ return {
536953
+ mime: mime2,
536954
+ base64: base643,
536955
+ textWithoutImage: text.replace(markerPattern, `[image data omitted: ${mime2}, ${base643.length} base64 chars]`).trim()
536956
+ };
536957
+ }
536958
+ const dataUrlPattern = /data:(image\/[a-zA-Z0-9.+-]+);base64,([A-Za-z0-9+/=]+)/;
536959
+ const dataUrlMatch = text.match(dataUrlPattern);
536960
+ if (!dataUrlMatch)
536961
+ return null;
536962
+ const mime = dataUrlMatch[1];
536963
+ const base642 = dataUrlMatch[2];
536964
+ return {
536965
+ mime,
536966
+ base64: base642,
536967
+ textWithoutImage: text.replace(dataUrlPattern, `[image data omitted: ${mime}, ${base642.length} base64 chars]`).trim()
536968
+ };
536969
+ }
536970
+ async analyzeImageDataForContext(mime, base642, textContent) {
536971
+ const imageUrl = `data:${mime};base64,${base642}`;
536972
+ const tmpImgPath = this.writeTempImageForOcr(mime, base642);
536973
+ const [visionOutcome, ocrOutcome] = await Promise.allSettled([
536974
+ this.describeImageViaVisionSubagent(imageUrl, textContent),
536975
+ tmpImgPath ? this.extractImageOcrText(tmpImgPath) : Promise.resolve("")
536976
+ ]);
536977
+ const visionDesc = visionOutcome.status === "fulfilled" ? visionOutcome.value.trim() : "";
536978
+ const ocrText = ocrOutcome.status === "fulfilled" ? ocrOutcome.value.trim() : "";
536979
+ const sections = [];
536980
+ if (visionDesc)
536981
+ sections.push(`[Image analysis]: ${visionDesc}`);
536982
+ if (ocrText)
536983
+ sections.push(`[OCR extracted text]: ${ocrText}`);
536984
+ if (sections.length > 0)
536985
+ return { contextBlock: sections.join("\n\n") };
536986
+ const errorReason = visionOutcome.status === "rejected" ? String(visionOutcome.reason?.message ?? visionOutcome.reason) : void 0;
536987
+ return { contextBlock: "", errorReason };
536988
+ }
536921
536989
  async describeImageViaVisionSubagent(imageUrl, textContent) {
536922
536990
  const visionMessages = [
536923
536991
  {
@@ -591936,6 +592004,36 @@ function normalizeTelegramMedia(message2) {
591936
592004
  }
591937
592005
  return void 0;
591938
592006
  }
592007
/**
 * Decide whether a normalized Telegram media attachment is a still image.
 *
 * Precedence: a `photo` type is always an image; otherwise an `image/*` MIME
 * type decides. The file-name extension is consulted only when the MIME type
 * is missing or the generic `application/octet-stream`, so that a specific
 * non-image type (for example `video/mp4` on a file named `clip.gif`) is not
 * overridden by the name.
 *
 * @param {{type?: string, mimeType?: string, fileName?: string} | null | undefined} media
 * @returns {boolean} True when the attachment should be handled as an image.
 */
function telegramMediaIsImage(media) {
  if (!media) return false;
  if (media.type === "photo") return true;
  const mime = typeof media.mimeType === "string" ? media.mimeType.trim().toLowerCase() : "";
  if (mime.startsWith("image/")) return true;
  if (mime !== "" && mime !== "application/octet-stream") return false;
  return /\.(png|jpe?g|gif|webp|bmp|tiff?)$/i.test(media.fileName ?? "");
}
592012
/**
 * Choose the file extension to use when saving a Telegram image attachment.
 *
 * A recognised extension on the file name wins (returned lower-cased).
 * Otherwise the extension is derived from the MIME type, and `.jpg` is the
 * fallback when neither source identifies the format.
 *
 * @param {{mimeType?: string, fileName?: string} | null | undefined} media
 * @returns {string} Extension including the leading dot, e.g. ".png".
 */
function telegramImageExtension(media) {
  const fileName = typeof media?.fileName === "string" ? media.fileName : "";
  const fromName = /\.(png|jpe?g|gif|webp|bmp|tiff?)$/i.exec(fileName);
  if (fromName) return fromName[0].toLowerCase();

  const mime = typeof media?.mimeType === "string" ? media.mimeType.toLowerCase() : "";
  // Substring hints, checked in order. The first five reproduce the original
  // mapping; the rest keep other image/* types from being mislabelled ".jpg".
  const mimeHints = [
    ["png", ".png"],
    ["webp", ".webp"],
    ["gif", ".gif"],
    ["bmp", ".bmp"],
    ["tiff", ".tif"],
    ["svg", ".svg"],
    ["heic", ".heic"],
    ["heif", ".heif"],
    ["avif", ".avif"]
  ];
  for (const [needle, ext] of mimeHints) {
    if (mime.includes(needle)) return ext;
  }
  return ".jpg";
}
592027
/**
 * Resolve the MIME type to report for a Telegram image attachment.
 *
 * A declared `image/*` type is returned in normalized form (trimmed and
 * lower-cased, with the unregistered alias `image/jpg` mapped to
 * `image/jpeg`). Otherwise the type is derived from the extension chosen by
 * `telegramImageExtension`, defaulting to `image/jpeg`.
 *
 * @param {{mimeType?: string, fileName?: string} | null | undefined} media
 * @returns {string} An `image/*` MIME type.
 */
function telegramImageMime(media) {
  const declared = typeof media?.mimeType === "string" ? media.mimeType.trim().toLowerCase() : "";
  if (declared.startsWith("image/")) {
    return declared === "image/jpg" ? "image/jpeg" : declared;
  }
  switch (telegramImageExtension(media)) {
    case ".png":
      return "image/png";
    case ".webp":
      return "image/webp";
    case ".gif":
      return "image/gif";
    case ".bmp":
      return "image/bmp";
    case ".tif":
    case ".tiff":
      return "image/tiff";
    default:
      return "image/jpeg";
  }
}
591939
592037
  function normalizeTelegramUpdate(update2) {
591940
592038
  const sourceUpdateType = update2.guest_message ? "guest_message" : update2.message ? "message" : null;
591941
592039
  if (!sourceUpdateType) return null;
@@ -592794,12 +592892,22 @@ Join: ${newUrl}`);
592794
592892
  }
592795
592893
  const existing = this.subAgents.get(sessionKey);
592796
592894
  if (existing && !existing.aborted) {
592797
- this.recordChatHistory(sessionKey, { role: "user", text: msg.text, mode: "steering" });
592895
+ let steeringText = msg.text;
592896
+ if (msg.media) {
592897
+ const mediaContext = await this.processMedia(msg);
592898
+ if (mediaContext) {
592899
+ steeringText += `
592900
+
592901
+ [Media attached — processed content below]
592902
+ ${mediaContext}`;
592903
+ }
592904
+ }
592905
+ this.recordChatHistory(sessionKey, { role: "user", text: steeringText, mode: "steering" });
592798
592906
  if (existing.runner) {
592799
- existing.runner.injectUserMessage(msg.text);
592907
+ existing.runner.injectUserMessage(steeringText);
592800
592908
  this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, "mid-conversation steering injected"));
592801
592909
  } else {
592802
- existing.pendingMessages.push(msg.text);
592910
+ existing.pendingMessages.push(steeringText);
592803
592911
  this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, `queued (${existing.pendingMessages.length} pending)`));
592804
592912
  }
592805
592913
  return;
@@ -593494,8 +593602,9 @@ Todo/session id: ${sessionContext.sessionId}` : `Telegram ${isGroup ? "group" :
593494
593602
  async processMedia(msg) {
593495
593603
  if (!msg.media) return "";
593496
593604
  const { type, fileId, fileUniqueId, mimeType, caption } = msg.media;
593605
+ const isImageMedia = telegramMediaIsImage(msg.media);
593497
593606
  let ext = ".bin";
593498
- if (type === "photo") ext = ".jpg";
593607
+ if (isImageMedia) ext = telegramImageExtension(msg.media);
593499
593608
  else if (type === "audio" || type === "voice") ext = ".ogg";
593500
593609
  else if (type === "video" || type === "video_note" || type === "live_photo") ext = ".mp4";
593501
593610
  else if (msg.media.fileName) {
@@ -593526,23 +593635,27 @@ Todo/session id: ${sessionContext.sessionId}` : `Telegram ${isGroup ? "group" :
593526
593635
  username: msg.username
593527
593636
  });
593528
593637
  let description = `[${type}${caption ? `: ${caption}` : ""}]`;
593529
- if (type === "photo") {
593638
+ if (isImageMedia) {
593530
593639
  let visionContext = "";
593531
593640
  try {
593532
593641
  const { runVisionIngress: runVisionIngress2, formatImageContextPrefix: formatImageContextPrefix2 } = await Promise.resolve().then(() => (init_vision_ingress(), vision_ingress_exports));
593533
593642
  const ingressResult = await runVisionIngress2(
593534
- { path: localPath, buffer: Buffer.from(""), mime: "image/png" },
593535
- ""
593643
+ {
593644
+ path: localPath,
593645
+ buffer: readFileSync84(localPath),
593646
+ mime: telegramImageMime(msg.media)
593647
+ },
593648
+ this.agentConfig?.model ?? ""
593536
593649
  );
593537
593650
  visionContext = formatImageContextPrefix2(ingressResult);
593538
593651
  cacheEntry.extractedContent = ingressResult.contextBlock;
593539
593652
  } catch {
593540
593653
  }
593541
593654
  if (visionContext) {
593542
- description = `[Photo received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
593655
+ description = `[Image received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
593543
593656
  ${visionContext}]`;
593544
593657
  } else {
593545
- description = `[Photo received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read or vision tools to analyze it if available.]`;
593658
+ description = `[Image received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read or vision tools to analyze it if available.]`;
593546
593659
  }
593547
593660
  try {
593548
593661
  await fetch("http://127.0.0.1:11435/v1/memory/ingest", {
@@ -593553,8 +593666,7 @@ ${visionContext}]`;
593553
593666
  });
593554
593667
  } catch {
593555
593668
  }
593556
- }
593557
- if (type === "audio" || type === "voice") {
593669
+ } else if (type === "audio" || type === "voice") {
593558
593670
  let transcription = null;
593559
593671
  try {
593560
593672
  const { getListenEngine: getListenEngine2 } = await Promise.resolve().then(() => (init_listen(), listen_exports));
@@ -593580,12 +593692,10 @@ ${visionContext}]`;
593580
593692
  });
593581
593693
  } catch {
593582
593694
  }
593583
- }
593584
- if (type === "video" || type === "video_note" || type === "live_photo") {
593695
+ } else if (type === "video" || type === "video_note" || type === "live_photo") {
593585
593696
  const label = type === "live_photo" ? "Live photo" : "Video";
593586
593697
  description = `[${label} received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
593587
- }
593588
- if (type === "document") {
593698
+ } else if (type === "document") {
593589
593699
  description = `[Document received: ${msg.media.fileName || "unnamed"}${mimeType ? ` (${mimeType})` : ""}, saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
593590
593700
  }
593591
593701
  cacheEntry.extractedContent = description;
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "open-agents-ai",
3
- "version": "0.187.593",
3
+ "version": "0.187.594",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "open-agents-ai",
9
- "version": "0.187.593",
9
+ "version": "0.187.594",
10
10
  "hasInstallScript": true,
11
11
  "license": "CC-BY-NC-4.0",
12
12
  "dependencies": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "open-agents-ai",
3
- "version": "0.187.593",
3
+ "version": "0.187.594",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",