@holoscript/holoscript-agent 2.1.3 → 2.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1082,13 +1082,13 @@ var MESH_TOOLS = [
1082
1082
  },
1083
1083
  {
1084
1084
  name: "vision_analyze",
1085
- description: "Analyze an image using the local Fara-7B vision model (Ollama on loopback). Reads the image file at `image_path`, sends it to fara:7b via the local Ollama API, and returns the model's text analysis. Counts as a productive tool call \u2014 use for GUI-grounding, visual QA, image captioning, or any task that requires perceiving image content. Only available on surfaces with a local Ollama instance running fara:7b.",
1085
+ description: "Analyze an image using the local Fara-7B vision model (Ollama on loopback). Reads the image file at `image_path` (max 512KB \u2014 downscale larger images first), sends it to the vision model via the local Ollama API (env: HOLOSCRIPT_AGENT_VISION_MODEL), and returns the model's text analysis. Counts as a productive tool call \u2014 use for GUI-grounding, visual QA, image captioning, or any task that requires perceiving image content. Only available on surfaces with a local Ollama instance and HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL set.",
1086
1086
  input_schema: {
1087
1087
  type: "object",
1088
1088
  properties: {
1089
1089
  image_path: {
1090
1090
  type: "string",
1091
- description: "Absolute path to the image file (png, jpg, webp, gif)"
1091
+ description: "Absolute path to the image file (png, jpg, webp) \u2014 must be under 512KB"
1092
1092
  },
1093
1093
  prompt: {
1094
1094
  type: "string",
@@ -1096,7 +1096,7 @@ var MESH_TOOLS = [
1096
1096
  },
1097
1097
  model: {
1098
1098
  type: "string",
1099
- description: 'Ollama model tag to use (default: "fara:7b")'
1099
+ description: "Ollama model tag override (default: HOLOSCRIPT_AGENT_VISION_MODEL env var)"
1100
1100
  }
1101
1101
  },
1102
1102
  required: ["image_path"]
@@ -1350,19 +1350,29 @@ ${truncated}`);
1350
1350
  const denied = checkReadAllowed(imagePath);
1351
1351
  if (denied) return errResult(use.id, `vision_analyze: ${denied}`);
1352
1352
  const prompt = String(use.input.prompt ?? "Describe this image in detail.");
1353
- const model = String(use.input.model ?? "fara:7b");
1353
+ const model = String(
1354
+ use.input.model ?? process.env.HOLOSCRIPT_AGENT_VISION_MODEL ?? "fara:7b"
1355
+ );
1354
1356
  const ollamaBase = process.env.HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL;
1355
1357
  if (!ollamaBase) {
1356
1358
  return errResult(
1357
1359
  use.id,
1358
- "vision_analyze: HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL is not set \u2014 configure it to point to your local Ollama instance (e.g. http://holojetson.local:11434)"
1360
+ "vision_analyze: HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL is not set \u2014 configure it to point to your local Ollama instance"
1359
1361
  );
1360
1362
  }
1363
+ const MAX_IMAGE_BYTES = 512e3;
1361
1364
  const TIMEOUT_MS = 12e4;
1362
1365
  const controller = new AbortController();
1363
1366
  const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
1364
1367
  try {
1365
1368
  const imageBytes = await readFile3(imagePath);
1369
+ if (imageBytes.length > MAX_IMAGE_BYTES) {
1370
+ clearTimeout(timer);
1371
+ return errResult(
1372
+ use.id,
1373
+ `vision_analyze: image is ${Math.round(imageBytes.length / 1024)}KB \u2014 exceeds ${MAX_IMAGE_BYTES / 1024}KB limit. Downscale the image first (e.g. to 256\xD7256 or smaller) then retry vision_analyze.`
1374
+ );
1375
+ }
1366
1376
  const imageB64 = imageBytes.toString("base64");
1367
1377
  const res = await fetch(`${ollamaBase}/api/generate`, {
1368
1378
  method: "POST",
@@ -1837,6 +1847,44 @@ Call write_file NOW. Embed ALL data from the tool result above into the content.
1837
1847
  finalText = reResp.content;
1838
1848
  lastResponse = reResp;
1839
1849
  }
1850
+ const WRITE_NAMES = /* @__PURE__ */ new Set(["write_file", "str_replace"]);
1851
+ if (toolsCalled.has("vision_analyze") && ![...toolsCalled].some((n) => WRITE_NAMES.has(n)) && iters < MAX_TOOL_ITERS) {
1852
+ iters++;
1853
+ if (messages.length > 0 && messages[messages.length - 1].role === "assistant") {
1854
+ messages.pop();
1855
+ }
1856
+ messages.push({
1857
+ role: "user",
1858
+ content: `vision_analyze returned a caption but you did NOT call write_file.
1859
+ Task: ${target.title}
1860
+ Output path: ${target.description.match(/path[:\s]+([^\s\n,]+\.json)/i)?.[1] ?? "see task description"}
1861
+ Call write_file NOW. Put the caption from vision_analyze into the JSON content field. Do NOT output text \u2014 your ONLY valid response is a write_file tool call.`
1862
+ });
1863
+ const vwResp = await provider.complete(
1864
+ { messages, maxTokens: 8192, temperature: 0, tools: activeTools },
1865
+ identity.llmModel
1866
+ );
1867
+ aggUsage = {
1868
+ promptTokens: aggUsage.promptTokens + vwResp.usage.promptTokens,
1869
+ completionTokens: aggUsage.completionTokens + vwResp.usage.completionTokens,
1870
+ totalTokens: aggUsage.totalTokens + vwResp.usage.totalTokens
1871
+ };
1872
+ if (vwResp.finishReason === "tool_use" && vwResp.toolUses && vwResp.toolUses.length > 0) {
1873
+ log({ ev: "vision-write-call", taskId: target.id, iter: iters, tools: vwResp.toolUses.map((t) => t.name) });
1874
+ const vwProd = summarizeToolProductivity(vwResp.toolUses);
1875
+ for (const n of vwProd.names) toolsCalled.add(n);
1876
+ productiveCallCount += vwProd.productiveCount;
1877
+ messages.push({ role: "assistant", content: vwResp.assistantBlocks ?? [] });
1878
+ const vwResults = await Promise.all(
1879
+ vwResp.toolUses.map(
1880
+ (u) => runTool(u, { signReceipt: this.opts.signReceipt, addTask: (tasks2) => mesh.addTasks(tasks2) })
1881
+ )
1882
+ );
1883
+ messages.push({ role: "user", content: vwResults });
1884
+ }
1885
+ finalText = vwResp.content;
1886
+ lastResponse = vwResp;
1887
+ }
1840
1888
  const durationMs = Date.now() - start;
1841
1889
  if (productiveCallCount === 0) {
1842
1890
  log({