npm - @nanhara/hara - Versions diffs - 0.33.0 → 0.48.0 - Mend

@nanhara/hara 0.33.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/CHANGELOG.md +152 -1
package/README.md +12 -4
package/dist/index.js +303 -76
package/dist/org/planner.js +19 -0
package/dist/search/semindex.js +62 -11
package/dist/session/store.js +14 -0
package/dist/tools/computer.js +156 -16
package/dist/tui/App.js +40 -5
package/dist/tui/InputBox.js +2 -2
package/dist/vision.js +52 -3
package/package.json +3 -2
package/plugins/browser/.hara-plugin/plugin.json +9 -0
package/plugins/browser/skills/web/SKILL.md +27 -0
package/plugins/chrome/.hara-plugin/plugin.json +9 -0
package/plugins/chrome/skills/chrome/SKILL.md +26 -0

package/dist/search/semindex.js CHANGED Viewed

@@ -2,7 +2,7 @@
 // code-asset / repo / knowledge-base scale (hundreds–low-thousands of chunks); the optional zvec adapter is
 // the scale-up path later. Markdown/code stays the SSOT; this index is a derived, rebuildable, gitignored
 // artifact. The embedder is injected (see embed.ts) so the store + chunking are testable without a model.
-import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
+import { readFileSync, writeFileSync, mkdirSync, existsSync, statSync } from "node:fs";
 import { homedir } from "node:os";
 import { join, dirname } from "node:path";
 import { findProjectRoot } from "../context/agents-md.js";
@@ -15,14 +15,22 @@ export function indexPath(name, cwd) {
         return join(findProjectRoot(cwd), ".hara", "index", "repo.json");
     return join(homedir(), ".hara", "index", `${name}.json`);
 }
+function statMtime(p) { // modification time of path `p` (statSync(p).mtimeMs), or 0 when it can't be stat'ed
+    try {
+        return statSync(p).mtimeMs;
+    }
+    catch {
+        return 0; // any stat failure maps to 0 instead of throwing
+    }
+}
 /** Split a file into chunks: Markdown by `#` headings, code by ~40-line windows. Heuristic, zero-dep —
- *  also the substrate embeddings reuse. */
-export function chunkText(text, file, source) {
+ *  also the substrate embeddings reuse. `mtime` (when given) is stamped on every chunk for incremental reuse. */
+export function chunkText(text, file, source, mtime) {
     const out = [];
     const push = (body, n) => {
         const t = body.trim();
         if (t.length >= 12)
-            out.push({ id: `${file}#${n}`, text: t.slice(0, 2000), file, source });
+            out.push({ id: `${file}#${n}`, text: t.slice(0, 2000), file, source, mtime });
     };
     if (/\.(md|mdx)$/i.test(file)) {
         const parts = text.split(/^(?=#{1,6}\s)/m);
@@ -49,23 +57,66 @@ function cosine(a, b) {
     }
     return na && nb ? dot / (Math.sqrt(na) * Math.sqrt(nb)) : 0;
 }
-/** Embed all chunks and write the index file. Returns the count written. */
+/** Build/refresh the index. **Incremental**: files whose mtime is unchanged since the last build keep their
+ *  existing vectors (no re-embed); only new/changed files are embedded, and deleted files drop out. A changed
+ *  embedding model forces a full rebuild (old vectors aren't comparable). Returns chunk counts as
+ *  `{ total, embedded, reused }`, where `embedded` is the number of chunks sent to the embedder. */
 export async function buildIndex(name, chunks, embed, cwd, model = "embed") {
+    const p = indexPath(name, cwd);
+    // Load the previous index → reuse vectors for unchanged files.
+    const prevByFile = new Map();
+    let prevModel = "";
+    if (existsSync(p)) {
+        try {
+            const old = JSON.parse(readFileSync(p, "utf8"));
+            prevModel = old.model;
+            for (const it of old.items ?? []) {
+                const arr = prevByFile.get(it.file);
+                if (arr)
+                    arr.push(it);
+                else
+                    prevByFile.set(it.file, [it]);
+            }
+        }
+        catch {
+            /* corrupt index → full rebuild */
+        }
+    }
+    const sameModel = prevModel === model;
+    const byFile = new Map();
+    for (const c of chunks) {
+        const arr = byFile.get(c.file);
+        if (arr)
+            arr.push(c);
+        else
+            byFile.set(c.file, [c]);
+    }
     const items = [];
+    const toEmbed = [];
+    let reused = 0;
+    for (const [file, fchunks] of byFile) {
+        const mtime = fchunks[0].mtime ?? 0;
+        const prev = prevByFile.get(file);
+        if (sameModel && prev?.length && mtime > 0 && prev.every((it) => it.mtime === mtime)) {
+            items.push(...prev); // file unchanged → keep its vectors
+            reused += prev.length;
+        }
+        else {
+            toEmbed.push(...fchunks);
+        }
+    }
     const B = 64;
-    for (let i = 0; i < chunks.length; i += B) {
-        const batch = chunks.slice(i, i + B);
+    for (let i = 0; i < toEmbed.length; i += B) {
+        const batch = toEmbed.slice(i, i + B);
         const vecs = await embed(batch.map((c) => c.text));
         batch.forEach((c, j) => vecs[j] && items.push({ ...c, vec: vecs[j] }));
     }
-    const p = indexPath(name, cwd);
     const dir = dirname(p);
     mkdirSync(dir, { recursive: true });
     // The index is derived + rebuildable (and may embed file contents) — never let it be committed.
     if (!existsSync(join(dir, ".gitignore")))
         writeFileSync(join(dir, ".gitignore"), "*\n", "utf8");
     writeFileSync(p, JSON.stringify({ model, items }), "utf8");
-    return items.length;
+    return { total: items.length, embedded: toEmbed.length, reused };
 }
 export function indexExists(name, cwd) {
     return existsSync(indexPath(name, cwd));
@@ -91,7 +142,7 @@ export function collectDirChunks(dir, source) {
         }
         if (isProbablyBinary(buf))
             continue;
-        chunks.push(...chunkText(buf.toString("utf8"), abs, source));
+        chunks.push(...chunkText(buf.toString("utf8"), abs, source, statMtime(abs)));
     }
     return chunks;
 }
@@ -113,7 +164,7 @@ export function collectRepoChunks(root) {
         }
         if (isProbablyBinary(buf))
             continue;
-        chunks.push(...chunkText(buf.toString("utf8"), rel, "repo"));
+        chunks.push(...chunkText(buf.toString("utf8"), rel, "repo", statMtime(abs)));
     }
     return chunks;
 }

package/dist/session/store.js CHANGED Viewed

@@ -56,6 +56,20 @@ export function titleFrom(history) {
     const firstUser = history.find((h) => h.role === "user");
     return deriveTitle(firstUser && firstUser.role === "user" ? firstUser.content : "");
 }
+/** Normalize a phrase to an ASCII kebab-case slug (lowercase, a–z0–9 + single hyphens, capped). Non-ASCII
+ *  is dropped — used to clean a model-generated English session name. Returns "" if nothing ASCII remains.
+ *
+ *  Steps, in order: trim → lowercase → delete every character that is not a–z, 0–9, whitespace or `-` →
+ *  trim → whitespace runs become `-` → hyphen runs collapse to one → leading hyphens are stripped → the
+ *  result is cut to `max` characters → trailing hyphens are stripped (so it can end up shorter than `max`).
+ *  Deleted characters do not act as separators: "foo.bar" becomes "foobar", not "foo-bar".
+ *  @param {string} text - phrase to slugify; must be a string (the first call is `text.trim()`).
+ *  @param {number} [max=40] - maximum slug length, applied with `slice(0, max)`.
+ *  @returns {string} the slug, possibly empty. */
+export function slugify(text, max = 40) {
+    return text
+        .trim()
+        .toLowerCase()
+        .replace(/[^a-z0-9\s-]/g, "")
+        .trim()
+        .replace(/\s+/g, "-")
+        .replace(/-+/g, "-")
+        .replace(/^-+/, "")
+        .slice(0, max)
+        .replace(/-+$/, "");
+}
 export function saveSession(meta, history) {
     meta.updatedAt = new Date().toISOString();
     const data = { meta, history };

package/dist/tools/computer.js CHANGED Viewed

@@ -11,7 +11,7 @@ import { join } from "node:path";
 import { registerTool } from "./registry.js";
 import { loadConfig } from "../config.js";
 const RANK = { off: 0, read: 1, click: 2, full: 3 };
-const ACTION_MIN = { screenshot: "read", move: "click", click: "click", type: "full", key: "full" };
+const ACTION_MIN = { screenshot: "read", find: "read", activate: "click", move: "click", click: "click", type: "full", key: "full" };
 // dangerous combos refused even at full tier (quit / close / delete / task-switch-kill)
 const KEY_BLOCK = /(?:\b(cmd|command|ctrl|control|alt|option|win|super|meta)\b.*\+.*\b(q|w|delete|del|f4|escape|esc)\b)|ctrl\+alt\+(?:delete|del|backspace)/i;
 /** Whether the configured tier permits the action. Exported for tests. */
@@ -22,6 +22,25 @@ export function actionAllowed(tier, action) {
 export function keyIsBlocked(keys) {
     return KEY_BLOCK.test(keys);
 }
+// Circuit breaker (learned from codex): bound consecutive screen-control failures so the agent can't loop
+// forever on a broken setup. Reset on any success; after FAIL_LIMIT in a row, return a clear stop + how to fix.
+const FAIL_LIMIT = 3; // consecutive failures after which fail() returns the stop message
+let consecFails = 0; // module-level streak counter: failures since the last success or reset
+export function resetComputerFails() { // zero the failure streak
+    consecFails = 0;
+}
+function ok(msg) { // record a success: clears the streak and returns `msg` unchanged
+    consecFails = 0;
+    return msg;
+}
+function fail(msg) { // record a failure and return the text to report for it (progress line or stop message)
+    consecFails += 1;
+    if (consecFails >= FAIL_LIMIT) {
+        consecFails = 0; // the streak restarts after the stop message, so the next failure counts as 1 again
+        return `⛔ Stopping screen control — ${FAIL_LIMIT} actions failed in a row (last: ${msg}). Most likely a missing macOS permission (Accessibility for click/type, Screen Recording for screenshots) or the target app isn't reachable. Fix that, then ask me to try again — I won't keep retrying blindly.`; // NOTE(review): this text names macOS permissions on every platform
+    }
+    return `Failed: ${msg}  [${consecFails}/${FAIL_LIMIT} before I stop]`;
+}
 function run(cmd, args) {
     try {
         const r = spawnSync(cmd, args, { encoding: "utf8", timeout: 15000 });
@@ -35,6 +54,23 @@ function has(cmd) {
     return (process.platform === "win32" ? run("where", [cmd]) : run("which", [cmd])).ok;
 }
 const ps = (script) => run("powershell", ["-NoProfile", "-Command", script]);
+/** Put text on the OS clipboard (so `type` can paste it — IME-safe + Unicode-safe, unlike keystroke injection). */
+function setClipboard(text) {
+    try {
+        if (process.platform === "darwin")
+            return spawnSync("pbcopy", [], { input: text, timeout: 5000 }).status === 0;
+        if (process.platform === "win32")
+            return spawnSync("clip", [], { input: text, timeout: 5000 }).status === 0;
+        if (has("wl-copy"))
+            return spawnSync("wl-copy", [], { input: text, timeout: 5000 }).status === 0;
+        if (has("xclip"))
+            return spawnSync("xclip", ["-selection", "clipboard"], { input: text, timeout: 5000 }).status === 0;
+    }
+    catch {
+        /* fall through */
+    }
+    return false;
+}
 let seq = 0;
 function tmpShot() {
     seq += 1;
@@ -73,6 +109,49 @@ function screenshot() {
     }
     return { path: out };
 }
+/** Bring an app to the foreground so screenshots/clicks land on IT, not the terminal hara runs in. */
+function activateApp(app) {
+    if (process.platform === "darwin") {
+        // `open -a` reliably launches+foregrounds; `osascript … activate` often leaves another window on top.
+        const r = run("open", ["-a", app]);
+        return { ok: r.ok, msg: r.ok ? `activated ${app}` : r.out || `couldn't activate ${app}` };
+    }
+    if (process.platform === "win32") {
+        const r = ps(`(New-Object -ComObject WScript.Shell).AppActivate(${JSON.stringify(app)})`);
+        return { ok: r.ok, msg: r.ok ? `activated ${app}` : r.out || `couldn't activate ${app}` };
+    }
+    if (process.platform === "linux") {
+        const r = has("wmctrl") ? run("wmctrl", ["-a", app]) : run("xdotool", ["search", "--name", app, "windowactivate"]);
+        return { ok: r.ok, msg: r.ok ? `activated ${app}` : r.out || `couldn't activate ${app} (need wmctrl/xdotool)` };
+    }
+    return { ok: false, msg: `activate unsupported on ${process.platform}` };
+}
+/** Logical screen size in the coordinate space the click backends use (points on mac, pixels on win/linux).
+ *  Grounding returns 0..1 fractions, so click = fraction × this. null if undetectable.
+ *  Sources: macOS — Finder's "bounds of window of desktop" (3rd and 4th integers of the reply); Linux —
+ *  `xdotool getdisplaygeometry`; Windows — `PrimaryScreen.Bounds` through PowerShell.
+ *  @returns {{w: number, h: number} | null} the size, or null when the platform is not one of the three, the
+ *  reply can't be parsed into usable numbers, or anything throws. */
+function screenSize() {
+    try {
+        if (process.platform === "darwin") {
+            const r = run("osascript", ["-e", 'tell application "Finder" to get bounds of window of desktop']);
+            const n = r.out.match(/-?\d+/g);
+            if (n && n.length >= 4)
+                return { w: Number(n[2]), h: Number(n[3]) }; // NOTE(review): takes the 3rd/4th bounds values as width/height — presumably right/bottom edges, equal to the size only if the origin is 0,0; confirm for multi-display layouts
+        }
+        else if (process.platform === "linux") {
+            const [w, h] = run("xdotool", ["getdisplaygeometry"]).out.trim().split(/\s+/).map(Number);
+            if (w && h) // rejects NaN/0, i.e. empty or unparsable output
+                return { w, h };
+        }
+        else if (process.platform === "win32") {
+            const [w, h] = ps('Add-Type -AssemblyName System.Windows.Forms; $b=[System.Windows.Forms.Screen]::PrimaryScreen.Bounds; "$($b.Width) $($b.Height)"').out.trim().split(/\s+/).map(Number);
+            if (w && h)
+                return { w, h };
+        }
+    }
+    catch {
+        /* fall through */
+    }
+    return null;
+}
 /** Name of the frontmost application/window (for the allowlist check). "" if undetectable. */
 function frontmostApp() {
     if (process.platform === "darwin") {
@@ -128,21 +207,41 @@ using System;using System.Runtime.InteropServices;public class Ms{[DllImport("us
         const text = String(input.text ?? "");
         if (!text)
             return { ok: false, msg: "type needs text" };
+        // IME-safe path: set the clipboard and paste. Keystroke injection (below) is intercepted/garbled by a
+        // CJK input method and can't enter Chinese/emoji reliably; pasting is immune and Unicode-safe.
+        if (setClipboard(text)) {
+            if (mac && has("cliclick")) {
+                const r = run("cliclick", ["kd:cmd", "t:v", "ku:cmd"]); // Cmd+V
+                if (r.ok)
+                    return { ok: true, msg: `pasted ${text.length} chars` };
+            }
+            else if (lin && has("xdotool")) {
+                const r = run("xdotool", ["key", "ctrl+v"]);
+                if (r.ok)
+                    return { ok: true, msg: `pasted ${text.length} chars` };
+            }
+            else if (win) {
+                const r = ps("Add-Type -AssemblyName System.Windows.Forms;[System.Windows.Forms.SendKeys]::SendWait('^v')");
+                if (r.ok)
+                    return { ok: true, msg: `pasted ${text.length} chars` };
+            }
+        }
+        // Fallback: keystroke injection (fine for ASCII when no IME is active).
         if (mac) {
             if (!has("cliclick"))
                 return { ok: false, msg: "cliclick not found — install with `brew install cliclick`" };
             const r = run("cliclick", [`t:${text}`]);
-            return { ok: r.ok, msg: r.ok ? `typed ${text.length} chars` : r.out };
+            return { ok: r.ok, msg: r.ok ? `typed ${text.length} chars (keystroke)` : r.out };
         }
         if (lin) {
             if (!has("xdotool"))
                 return { ok: false, msg: "xdotool not found" };
             const r = run("xdotool", ["type", "--clearmodifiers", text]);
-            return { ok: r.ok, msg: r.ok ? `typed ${text.length} chars` : r.out };
+            return { ok: r.ok, msg: r.ok ? `typed ${text.length} chars (keystroke)` : r.out };
         }
         if (win) {
             const r = ps(`Add-Type -AssemblyName System.Windows.Forms;[System.Windows.Forms.SendKeys]::SendWait(${JSON.stringify(text)})`);
-            return { ok: r.ok, msg: r.ok ? `typed ${text.length} chars` : r.out };
+            return { ok: r.ok, msg: r.ok ? `typed ${text.length} chars (keystroke)` : r.out };
         }
     }
     if (action === "key") {
@@ -182,17 +281,23 @@ export function computerBackends() {
 }
 registerTool({
     name: "computer",
-    description: "Control the screen to operate desktop software (not just the browser): take a screenshot, then " +
-        "click/move/type/press keys at coordinates. Workflow: screenshot → read what's on screen → act. " +
-        "Opt-in and permission-gated (tier + per-app allowlist).",
+    description: "Control the screen to operate desktop software (not just the browser). ALWAYS `activate` the target app " +
+        "FIRST (e.g. activate WeChat) — otherwise screenshots/clicks hit the terminal hara runs in, not the app. " +
+        "Then prefer grounding over guessing pixels: pass `target` (e.g. 'the Send button') to click/move and it's " +
+        "located by a vision model; or `find` to just get coordinates. Workflow: activate → screenshot → click a " +
+        "target → re-screenshot to verify. When typing, type the ACTUAL text — never placeholders. Opt-in and " +
+        "permission-gated (tier + per-app allowlist).",
     input_schema: {
         type: "object",
         properties: {
-            action: { type: "string", enum: ["screenshot", "click", "move", "type", "key"] },
-            x: { type: "number", description: "x pixel (click/move)" },
-            y: { type: "number", description: "y pixel (click/move)" },
+            action: { type: "string", enum: ["screenshot", "activate", "find", "click", "move", "type", "key"] },
+            app: { type: "string", description: "app to bring to the foreground (activate) — e.g. 'WeChat'. Do this BEFORE screenshot/click so they hit the app, not the terminal." },
+            target: { type: "string", description: "describe a UI element to locate (find) or click/move to — e.g. 'the Send button'. Preferred over x,y." },
+            x: { type: "number", description: "x pixel (click/move; or use `target`)" },
+            y: { type: "number", description: "y pixel (click/move; or use `target`)" },
             text: { type: "string", description: "text to type (type)" },
             keys: { type: "string", description: "key or combo, e.g. 'return', 'cmd+c' (key)" },
+            focus: { type: "string", description: "screenshot only: what to look for — focuses the read" },
         },
         required: ["action"],
     },
@@ -205,7 +310,17 @@ registerTool({
         const action = String(input.action ?? "");
         if (!actionAllowed(tier, action))
             return `'${action}' needs a higher tier (current computerUse=${tier}). Raise it with \`hara config set computerUse …\`.`;
-        if (action !== "screenshot") {
+        // Bring the target app to the foreground first — without this, clicks land on the terminal hara runs in.
+        if (action === "activate") {
+            const app = String(input.app ?? input.target ?? "");
+            if (!app)
+                return "activate needs an `app` name (e.g. 'WeChat').";
+            if (!cfg.computerApps.some((a) => app.toLowerCase().includes(a.toLowerCase()) || a.toLowerCase().includes(app.toLowerCase())))
+                return `Refused: "${app}" isn't in your allowlist (${cfg.computerApps.join(", ") || "empty"}). Add it: \`hara config set computerApps "${app}"\`.`;
+            const r = activateApp(app);
+            return r.ok ? ok(`✓ ${r.msg} — now screenshot/find/click to act on it`) : fail(r.msg);
+        }
+        if (action !== "screenshot" && action !== "find") {
             // per-app allowlist: only act when an allowlisted app is frontmost (the key guard against wrong-window clicks)
             if (!cfg.computerApps.length)
                 return "No apps allowlisted — set `hara config set computerApps \"App Name, …\"` before clicking/typing.";
@@ -217,20 +332,45 @@ registerTool({
         if (action === "screenshot") {
             const s = screenshot();
             if (s.error)
-                return `Screenshot failed: ${s.error}`;
+                return fail(`screenshot — ${s.error}`);
             if (ctx.describeImage) {
                 try {
-                    const desc = await ctx.describeImage(s.path);
+                    const desc = await ctx.describeImage(s.path, input.focus ? String(input.focus) : undefined);
                     if (desc)
-                        return `Screenshot (read via vision):\n${desc}`;
+                        return ok(`Screenshot (read via vision):\n${desc}`);
                 }
                 catch {
                     /* fall through to path */
                 }
             }
-            return `Screenshot saved to ${s.path}. Configure a vision model so I can read it: \`hara config set visionModel <model>\`.`;
+            return ok(`Screenshot saved to ${s.path}. Configure a vision model so I can read it: \`hara config set visionModel <model>\`.`);
+        }
+        // Grounding: locate a described element and turn it into screen coordinates (more reliable than guessing
+        // pixels from a text description). Used for `find`, and for click/move when given a `target` and no x,y.
+        const needsLocate = action === "find" || ((action === "click" || action === "move") && input.target != null && (input.x == null || input.y == null));
+        if (needsLocate) {
+            const target = String(input.target ?? "");
+            if (!target)
+                return action === "find" ? "find needs a `target` (what to locate)." : "click/move needs `x,y` or a `target`.";
+            if (!ctx.locate)
+                return "Grounding needs a vision model that can see images — set one: `hara config set visionModel <model>`.";
+            const s = screenshot();
+            if (s.error)
+                return fail(`screenshot — ${s.error}`);
+            const loc = await ctx.locate(s.path, target);
+            if (!loc)
+                return fail(`couldn't locate "${target}" on screen — try a screenshot first, or rephrase the target`);
+            const size = screenSize();
+            if (!size)
+                return fail(`located "${target}" but couldn't read the screen size to convert coordinates`);
+            const gx = Math.round(loc.x * size.w);
+            const gy = Math.round(loc.y * size.h);
+            if (action === "find")
+                return ok(`"${target}" is at ~${gx},${gy} (${Math.round(loc.x * 100)}% across, ${Math.round(loc.y * 100)}% down).`);
+            input.x = gx;
+            input.y = gy;
         }
         const r = pointerOrKeyboard(action, input);
-        return r.ok ? `✓ ${r.msg}` : `Failed: ${r.msg}`;
+        return r.ok ? ok(`✓ ${r.msg}${needsLocate ? ` (located "${input.target}")` : ""}`) : fail(r.msg);
     },
 });

package/dist/tui/App.js CHANGED Viewed

@@ -13,6 +13,7 @@ import { InputBox } from "./InputBox.js";
 import { activity } from "../activity.js";
 import { ctxPctFor } from "../statusbar.js";
 import { accent } from "./theme.js";
+import { renderMarkdown } from "../md.js";
 let _id = 0;
 const nid = () => ++_id;
 const stripAnsi = (s) => s.replace(/\x1b\[[0-9;]*m/g, "");
@@ -21,7 +22,7 @@ function Block({ item, open }) {
         case "user":
             return (_jsxs(Box, { marginTop: 1, children: [_jsx(Text, { color: "cyan", children: "\u203A " }), _jsx(Text, { children: item.text })] }));
         case "assistant":
-            return _jsx(Text, { children: item.text });
+            return _jsx(Text, { children: renderMarkdown(item.text) }); // headers/bold/inline-code/bullets + verbatim fences
         case "reasoning": {
             // fixed-height window: show the last 5 lines while thinking; ctrl-r toggles the full text.
             const lines = item.text.replace(/\n+$/, "").split("\n");
@@ -71,6 +72,9 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
     const [promptSel, setPromptSel] = useState(0);
     const [reasoningOpen, setReasoningOpen] = useState(false);
     const ctrlRef = useRef(null);
+    const queueRef = useRef([]); // type-ahead: FIFO of messages entered while working
+    const [pool, setPool] = useState([]); // type-ahead pool: queued message lines, shown above the input
+    const drainingRef = useRef(false); // idempotency guard so the drain effect can't double-send one item
     const currentRef = useRef([]);
     currentRef.current = current;
     const statusRef = useRef(status);
@@ -90,8 +94,14 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
     }, []);
     const handleSubmit = useCallback(async (line, images) => {
         const t = line.trim();
-        if ((!t && !images?.length) || working || prompt)
-            return; // allow image-only turns
+        if ((!t && !images?.length) || prompt)
+            return; // nothing to send, or a choice is pending
+        if (working) {
+            // type-ahead: hold the message in the pool; all pooled messages are sent together when the turn ends
+            queueRef.current.push({ line, images });
+            setPool(queueRef.current.map((q) => q.line.trim() || "🖼 (image)"));
+            return;
+        }
         setHistory((h) => [...h, { id: nid(), kind: "user", text: t }]); // t already carries any [Image #N] tokens
         const ctrl = new AbortController();
         ctrlRef.current = ctrl;
@@ -130,6 +140,21 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
         setWorking(false);
         ctrlRef.current = null;
     }, [working, prompt, onSubmit, pushCurrent, model, exit]);
+    // Drain the type-ahead pool: when the turn finishes (working → false) and nothing awaits a choice, COALESCE
+    // every pooled message into ONE turn and send it — additions/clarifications go to the agent together, in order.
+    useEffect(() => {
+        if (working || prompt || drainingRef.current || !queueRef.current.length)
+            return;
+        drainingRef.current = true;
+        const batch = queueRef.current;
+        queueRef.current = [];
+        setPool([]);
+        const line = batch.map((b) => b.line).join("\n\n");
+        const images = batch.flatMap((b) => b.images ?? []);
+        void Promise.resolve(handleSubmit(line, images.length ? images : undefined)).finally(() => {
+            drainingRef.current = false;
+        });
+    }, [working, prompt, handleSubmit]);
     useInput((input, key) => {
         if (prompt) {
             const opts = prompt.options;
@@ -145,6 +170,10 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
                 prompt.resolve(opts[opts.length - 1].value); // last option = cancel/no
                 setPrompt(null);
             }
+            else if (/^[1-9]$/.test(input) && Number(input) <= opts.length) {
+                prompt.resolve(opts[Number(input) - 1].value); // type a number to pick directly
+                setPrompt(null);
+            }
             else if (input) {
                 const hit = opts.find((o) => o.key && o.key === input.toLowerCase());
                 if (hit) {
@@ -156,10 +185,16 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
         }
         if (key.ctrl && input === "r")
             return setReasoningOpen((x) => !x);
-        if (key.escape && working)
+        if (key.escape && working) {
+            // Esc = stop everything: abort the turn AND drop any type-ahead (a stopped turn shouldn't fire queued msgs)
+            if (queueRef.current.length) {
+                queueRef.current = [];
+                setPool([]);
+            }
             ctrlRef.current?.abort();
+        }
         else if (key.tab && key.shift && cycleApproval)
             setStatus((s) => ({ ...s, approval: cycleApproval(s.approval) }));
     });
-    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(Static, { items: header ? [{ id: -1, kind: "notice", text: "" }, ...history] : history, children: (item) => (item.id === -1 ? _jsx(HeaderCard, { ...header }, "hdr") : _jsx(Block, { item: item }, item.id)) }), current.map((item) => (_jsx(Block, { item: item, open: reasoningOpen }, item.id))), working && !prompt && _jsx(Working, {}), prompt && (_jsxs(Box, { flexDirection: "column", marginTop: 1, children: [_jsx(Text, { color: "yellow", children: `  ${stripAnsi(prompt.title)}` }), prompt.options.map((o, i) => (_jsx(Text, { color: i === promptSel ? "cyan" : undefined, bold: i === promptSel, children: (i === promptSel ? " ❯ " : "   ") + o.label }, i)))] })), _jsx(InputBox, { status: status, cwd: cwd, isActive: !working && !prompt, onSubmit: handleSubmit, onClipboardImage: onClipboardImage })] }));
+    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(Static, { items: header ? [{ id: -1, kind: "notice", text: "" }, ...history] : history, children: (item) => (item.id === -1 ? _jsx(HeaderCard, { ...header }, "hdr") : _jsx(Block, { item: item }, item.id)) }), current.map((item) => (_jsx(Block, { item: item, open: reasoningOpen }, item.id))), working && !prompt && _jsx(Working, {}), prompt && (_jsxs(Box, { flexDirection: "column", marginTop: 1, children: [_jsx(Text, { color: "yellow", children: `  ${stripAnsi(prompt.title)}` }), prompt.options.map((o, i) => (_jsx(Text, { color: i === promptSel ? "cyan" : undefined, bold: i === promptSel, children: (i === promptSel ? " ❯ " : "   ") + `${i + 1}. ` + o.label }, i))), _jsx(Text, { dimColor: true, children: `   ↑↓ or 1–${prompt.options.length} to choose · Enter · Esc cancels` })] })), pool.length > 0 && !prompt && (_jsx(Box, { flexDirection: "column", children: pool.map((l, i) => (_jsx(Text, { color: accent(), children: `  › ${l.length > 72 ? l.slice(0, 72) + "…" : l}` }, i))) })), _jsx(InputBox, { status: status, cwd: cwd, isActive: !prompt, working: working, queued: pool.length, onSubmit: handleSubmit, onClipboardImage: onClipboardImage })] }));
 }

package/dist/tui/InputBox.js CHANGED Viewed

@@ -84,7 +84,7 @@ function InputLine({ value, cursor }) {
     return _jsx(Text, { children: nodes });
 }
 /** Top border (session) + prompt line + bottom border (usage) + ModeBar, with an @path popup. */
-export function InputBox({ status, cwd, width, onSubmit, onClipboardImage, isActive = true, placeholder = "Type a task · /help · @file · Ctrl+V paste image · shift+tab mode · Esc interrupts", }) {
+export function InputBox({ status, cwd, width, onSubmit, onClipboardImage, isActive = true, working = false, queued = 0, placeholder = "Type a task · /help · @file · Ctrl+V paste image · shift+tab mode · Esc interrupts", }) {
     const { stdout } = useStdout();
     const w = width ?? stdout?.columns ?? 80;
     const [value, setValue] = useState("");
@@ -204,5 +204,5 @@ export function InputBox({ status, cwd, width, onSubmit, onClipboardImage, isAct
             set(value.slice(0, cursor) + input + value.slice(cursor), cursor + input.length);
         }
     }, { isActive });
-    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(TopBorder, { name: status.sessionName || "session", width: w }), _jsxs(Box, { children: [_jsx(Text, { color: "cyan", children: "› " }), value.length === 0 ? (_jsxs(Text, { children: [_jsx(Text, { inverse: true, children: " " }), _jsx(Text, { dimColor: true, children: placeholder })] })) : (_jsx(InputLine, { value: value, cursor: cursor }))] }), _jsx(BottomBorder, { s: status, width: w }), popupOpen ? _jsx(MentionPopup, { items: candidates, selected: selIdx, query: mention.query }) : null, _jsx(ModeBar, { approval: status.approval })] }));
+    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(TopBorder, { name: status.sessionName || "session", width: w }), _jsxs(Box, { children: [_jsx(Text, { color: "cyan", children: "› " }), value.length === 0 ? (_jsxs(Text, { children: [_jsx(Text, { inverse: true, children: " " }), _jsx(Text, { dimColor: true, children: placeholder })] })) : (_jsx(InputLine, { value: value, cursor: cursor }))] }), _jsx(BottomBorder, { s: status, width: w }), working ? _jsx(Text, { dimColor: true, children: `  ⌨ working — Enter queues your message${queued ? ` · ${queued} queued` : ""} · Esc interrupts` }) : null, popupOpen ? _jsx(MentionPopup, { items: candidates, selected: selIdx, query: mention.query }) : null, _jsx(ModeBar, { approval: status.approval })] }));
 }

package/dist/vision.js CHANGED Viewed

@@ -65,12 +65,61 @@ export const DESCRIBE_SYSTEM = [
     "4. Quote any error or warning messages exactly.",
     "5. Be thorough and factual; do not speculate beyond what is visible.",
 ].join("\n");
+// Screenshot variant — tuned for driving the desktop (RPA) rather than transcription. A text-only main
+// model can't see, so it needs *actionable* output: where things are, so it can issue clicks.
+export const SCREENSHOT_SYSTEM = [
+    "You are the eyes of an assistant operating this computer; it cannot see the screen and acts only on your",
+    "words. Describe the screenshot so it can ACT. Prioritise, in order:",
+    "1. INTERACTIVE elements — buttons, links, text fields, checkboxes, menus, tabs, icons — each with its",
+    "   visible label and an approximate location: a region (e.g. top-right) AND a rough pixel x,y if you can.",
+    "2. The currently focused/active element or selection, and any open dialog/modal/popup.",
+    "3. Errors, warnings, and key visible text/headings — quote them exactly.",
+    "4. One line on what app/screen this appears to be.",
+    "Positions guide clicks, so always estimate them. Be concise and factual; never invent elements.",
+].join("\n");
+// Grounding — ask a vision model WHERE a UI element is (for accurate RPA clicking), as resolution-independent
+// fractions so it works regardless of Retina/DPI scaling.
+export const LOCATE_SYSTEM = [
+    "You are given a screenshot. The user names ONE UI element (button, field, icon, menu item, link).",
+    "Return ONLY its CENTER as JSON: {\"x\": <0-1000>, \"y\": <0-1000>}, where x is the position as per-mille of",
+    "the image WIDTH (0=left, 1000=right) and y as per-mille of the HEIGHT (0=top, 1000=bottom).",
+    "If the element is not visible, return {\"x\": -1, \"y\": -1}. Output ONLY the JSON, nothing else.",
+].join("\n");
+/** Parse a grounding reply → {x,y} as 0..1 fractions (accepts per-mille / percent / fraction), or null. */
+export function parseLocate(text) {
+    const m = text.match(/"x"\s*:\s*(-?\d+(?:\.\d+)?)[\s,}]+.*?"y"\s*:\s*(-?\d+(?:\.\d+)?)/s) || text.match(/(-?\d+(?:\.\d+)?)\s*[,\s]\s*(-?\d+(?:\.\d+)?)/);
+    if (!m)
+        return null;
+    let x = Number(m[1]);
+    let y = Number(m[2]);
+    if (x < 0 || y < 0 || Number.isNaN(x) || Number.isNaN(y))
+        return null; // not found / unparseable
+    const norm = (v) => (v > 100 ? v / 1000 : v > 1.5 ? v / 100 : v); // per-mille | percent | fraction → 0..1
+    x = Math.min(1, Math.max(0, norm(x)));
+    y = Math.min(1, Math.max(0, norm(y)));
+    return { x, y };
+}
+/** Send a screenshot to a (grounding-capable) vision model and get the target's center as 0..1 fractions. */
+export async function locateImage(provider, image, target, opts = {}) {
+    const r = await provider.turn({
+        system: LOCATE_SYSTEM,
+        history: [{ role: "user", content: `Locate this element: ${target}`, images: [image] }],
+        tools: [],
+        onText: () => { },
+        signal: opts.signal,
+    });
+    if (r.stop === "error")
+        return null;
+    return parseLocate(r.text);
+}
 const PROMPT = "Describe the attached image(s) per your instructions.";
-/** Send images to the vision provider and return its textual description. Throws on a provider error. */
+/** Send images to the vision provider and return its textual description. Throws on a provider error.
+ *  `system` overrides the default prompt (e.g. SCREENSHOT_SYSTEM); `hint` focuses it on a specific goal. */
 export async function describeImages(provider, images, opts = {}) {
+    const content = opts.hint ? `${PROMPT}\nFocus especially on: ${opts.hint}` : PROMPT;
     const r = await provider.turn({
-        system: DESCRIBE_SYSTEM,
-        history: [{ role: "user", content: PROMPT, images }],
+        system: opts.system ?? DESCRIBE_SYSTEM,
+        history: [{ role: "user", content, images }],
         tools: [],
         onText: () => { },
         signal: opts.signal,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@nanhara/hara",
-  "version": "0.33.0",
+  "version": "0.48.0",
   "description": "hara — a coding agent CLI that runs like an engineering org.",
   "bin": {
     "hara": "dist/index.js"
@@ -11,7 +11,8 @@
     "README.md",
     "CHANGELOG.md",
     "LICENSE",
-    "CLA.md"
+    "CLA.md",
+    "plugins"
   ],
   "keywords": [
     "ai",

package/plugins/browser/.hara-plugin/plugin.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "browser",
+  "version": "0.1.0",
+  "description": "Reliable web automation for hara via the Playwright MCP — acts on the DOM/accessibility tree (selectors, auto-wait), not pixels. navigate / click / type / fill / snapshot.",
+  "skills": ["skills"],
+  "mcpServers": {
+    "browser": { "command": "npx", "args": ["-y", "@playwright/mcp@latest"] }
+  }
+}

package/plugins/browser/skills/web/SKILL.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+name: web-automation
+description: Operate web pages reliably — navigate, click, fill forms, log in, extract — via the Playwright MCP. Acts on the DOM/accessibility tree by selector/role (deterministic, auto-waiting), NOT screenshots or pixel coordinates. Far more reliable than desktop screen control.
+when_to_use: when the user wants to do anything on a website — open a page, click, fill/submit a form, log in, scrape data, automate a web flow.
+---
+# Web automation (Playwright MCP)
+Reliable browser tools are available as `mcp__browser__*` (navigate, snapshot, click, type, fill_form,
+select_option, evaluate, …). They act on the page's **accessibility tree by element ref/role/text** — not
+screenshots or pixel coordinates — so they're deterministic and auto-wait for elements. This is the reliable
+counterpart to the fragile desktop `computer` tool: prefer it for anything on the web.
+## Workflow
+1. `browser_navigate` to the URL.
+2. `browser_snapshot` — read the accessibility tree (elements + their `ref`s). This is your "eyes": use the
+   refs to act precisely. Prefer it over a screenshot.
+3. Act by ref/role/text: `browser_click`, `browser_type`, `browser_fill_form`, `browser_select_option`.
+4. `browser_snapshot` again to verify before the next step.
+## Notes
+- First run downloads a browser once: `npx playwright install chromium`.
+- The Playwright MCP uses its **own** browser (no logins). For tasks needing your **real logged-in Chrome**, use
+  `chrome-devtools-mcp` instead (drives your actual Chrome via CDP) — swap the mcpServers command to
+  `npx chrome-devtools-mcp@latest`. (This is what openclaw/cc-haha use.)
+- **Confirm before irreversible actions** — purchases, posting, sending messages, deleting. Verify the page/state
+  with a snapshot first.

package/plugins/chrome/.hara-plugin/plugin.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "chrome",
+  "version": "0.1.0",
+  "description": "Drive a real, persistent-login Chrome from hara via chrome-devtools-mcp (CDP) — for web tasks on sites you're already signed into (logins persist across runs). Alternative to the `browser` plugin's isolated Playwright browser — enable one, not both.",
+  "skills": ["skills"],
+  "mcpServers": {
+    "chrome": { "command": "npx", "args": ["-y", "chrome-devtools-mcp@latest"] }
+  }
+}