@holoscript/holoscript-agent 2.1.3 → 2.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +53 -5
- package/dist/index.js.map +1 -1
- package/dist/runner.js +53 -5
- package/dist/runner.js.map +1 -1
- package/dist/supervisor.js +53 -5
- package/dist/supervisor.js.map +1 -1
- package/package.json +1 -1
package/dist/supervisor.js
CHANGED
|
@@ -535,13 +535,13 @@ var MESH_TOOLS = [
|
|
|
535
535
|
},
|
|
536
536
|
{
|
|
537
537
|
name: "vision_analyze",
|
|
538
|
-
description: "Analyze an image using the local Fara-7B vision model (Ollama on loopback). Reads the image file at `image_path
|
|
538
|
+
description: "Analyze an image using the local Fara-7B vision model (Ollama on loopback). Reads the image file at `image_path` (max 512KB \u2014 downscale larger images first), sends it to the vision model via the local Ollama API (env: HOLOSCRIPT_AGENT_VISION_MODEL), and returns the model's text analysis. Counts as a productive tool call \u2014 use for GUI-grounding, visual QA, image captioning, or any task that requires perceiving image content. Only available on surfaces with a local Ollama instance and HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL set.",
|
|
539
539
|
input_schema: {
|
|
540
540
|
type: "object",
|
|
541
541
|
properties: {
|
|
542
542
|
image_path: {
|
|
543
543
|
type: "string",
|
|
544
|
-
description: "Absolute path to the image file (png, jpg, webp
|
|
544
|
+
description: "Absolute path to the image file (png, jpg, webp) \u2014 must be under 512KB"
|
|
545
545
|
},
|
|
546
546
|
prompt: {
|
|
547
547
|
type: "string",
|
|
@@ -549,7 +549,7 @@ var MESH_TOOLS = [
|
|
|
549
549
|
},
|
|
550
550
|
model: {
|
|
551
551
|
type: "string",
|
|
552
|
-
description:
|
|
552
|
+
description: "Ollama model tag override (default: HOLOSCRIPT_AGENT_VISION_MODEL env var)"
|
|
553
553
|
}
|
|
554
554
|
},
|
|
555
555
|
required: ["image_path"]
|
|
@@ -803,19 +803,29 @@ ${truncated}`);
|
|
|
803
803
|
const denied = checkReadAllowed(imagePath);
|
|
804
804
|
if (denied) return errResult(use.id, `vision_analyze: ${denied}`);
|
|
805
805
|
const prompt = String(use.input.prompt ?? "Describe this image in detail.");
|
|
806
|
-
const model = String(
|
|
806
|
+
const model = String(
|
|
807
|
+
use.input.model ?? process.env.HOLOSCRIPT_AGENT_VISION_MODEL ?? "fara:7b"
|
|
808
|
+
);
|
|
807
809
|
const ollamaBase = process.env.HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL;
|
|
808
810
|
if (!ollamaBase) {
|
|
809
811
|
return errResult(
|
|
810
812
|
use.id,
|
|
811
|
-
"vision_analyze: HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL is not set \u2014 configure it to point to your local Ollama instance
|
|
813
|
+
"vision_analyze: HOLOSCRIPT_AGENT_LOCAL_LLM_BASE_URL is not set \u2014 configure it to point to your local Ollama instance"
|
|
812
814
|
);
|
|
813
815
|
}
|
|
816
|
+
const MAX_IMAGE_BYTES = 512e3;
|
|
814
817
|
const TIMEOUT_MS = 12e4;
|
|
815
818
|
const controller = new AbortController();
|
|
816
819
|
const timer = setTimeout(() => controller.abort(), TIMEOUT_MS);
|
|
817
820
|
try {
|
|
818
821
|
const imageBytes = await readFile2(imagePath);
|
|
822
|
+
if (imageBytes.length > MAX_IMAGE_BYTES) {
|
|
823
|
+
clearTimeout(timer);
|
|
824
|
+
return errResult(
|
|
825
|
+
use.id,
|
|
826
|
+
`vision_analyze: image is ${Math.round(imageBytes.length / 1024)}KB \u2014 exceeds ${MAX_IMAGE_BYTES / 1024}KB limit. Downscale the image first (e.g. to 256\xD7256 or smaller) then retry vision_analyze.`
|
|
827
|
+
);
|
|
828
|
+
}
|
|
819
829
|
const imageB64 = imageBytes.toString("base64");
|
|
820
830
|
const res = await fetch(`${ollamaBase}/api/generate`, {
|
|
821
831
|
method: "POST",
|
|
@@ -1290,6 +1300,44 @@ Call write_file NOW. Embed ALL data from the tool result above into the content.
|
|
|
1290
1300
|
finalText = reResp.content;
|
|
1291
1301
|
lastResponse = reResp;
|
|
1292
1302
|
}
|
|
1303
|
+
const WRITE_NAMES = /* @__PURE__ */ new Set(["write_file", "str_replace"]);
|
|
1304
|
+
if (toolsCalled.has("vision_analyze") && ![...toolsCalled].some((n) => WRITE_NAMES.has(n)) && iters < MAX_TOOL_ITERS) {
|
|
1305
|
+
iters++;
|
|
1306
|
+
if (messages.length > 0 && messages[messages.length - 1].role === "assistant") {
|
|
1307
|
+
messages.pop();
|
|
1308
|
+
}
|
|
1309
|
+
messages.push({
|
|
1310
|
+
role: "user",
|
|
1311
|
+
content: `vision_analyze returned a caption but you did NOT call write_file.
|
|
1312
|
+
Task: ${target.title}
|
|
1313
|
+
Output path: ${target.description.match(/path[:\s]+([^\s\n,]+\.json)/i)?.[1] ?? "see task description"}
|
|
1314
|
+
Call write_file NOW. Put the caption from vision_analyze into the JSON content field. Do NOT output text \u2014 your ONLY valid response is a write_file tool call.`
|
|
1315
|
+
});
|
|
1316
|
+
const vwResp = await provider.complete(
|
|
1317
|
+
{ messages, maxTokens: 8192, temperature: 0, tools: activeTools },
|
|
1318
|
+
identity.llmModel
|
|
1319
|
+
);
|
|
1320
|
+
aggUsage = {
|
|
1321
|
+
promptTokens: aggUsage.promptTokens + vwResp.usage.promptTokens,
|
|
1322
|
+
completionTokens: aggUsage.completionTokens + vwResp.usage.completionTokens,
|
|
1323
|
+
totalTokens: aggUsage.totalTokens + vwResp.usage.totalTokens
|
|
1324
|
+
};
|
|
1325
|
+
if (vwResp.finishReason === "tool_use" && vwResp.toolUses && vwResp.toolUses.length > 0) {
|
|
1326
|
+
log({ ev: "vision-write-call", taskId: target.id, iter: iters, tools: vwResp.toolUses.map((t) => t.name) });
|
|
1327
|
+
const vwProd = summarizeToolProductivity(vwResp.toolUses);
|
|
1328
|
+
for (const n of vwProd.names) toolsCalled.add(n);
|
|
1329
|
+
productiveCallCount += vwProd.productiveCount;
|
|
1330
|
+
messages.push({ role: "assistant", content: vwResp.assistantBlocks ?? [] });
|
|
1331
|
+
const vwResults = await Promise.all(
|
|
1332
|
+
vwResp.toolUses.map(
|
|
1333
|
+
(u) => runTool(u, { signReceipt: this.opts.signReceipt, addTask: (tasks2) => mesh.addTasks(tasks2) })
|
|
1334
|
+
)
|
|
1335
|
+
);
|
|
1336
|
+
messages.push({ role: "user", content: vwResults });
|
|
1337
|
+
}
|
|
1338
|
+
finalText = vwResp.content;
|
|
1339
|
+
lastResponse = vwResp;
|
|
1340
|
+
}
|
|
1293
1341
|
const durationMs = Date.now() - start;
|
|
1294
1342
|
if (productiveCallCount === 0) {
|
|
1295
1343
|
log({
|