npm - @nanhara/hara - Versions diffs - 0.33.0 → 0.53.0 - Mend

@nanhara/hara 0.33.0 → 0.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/CHANGELOG.md +216 -1
package/README.md +15 -4
package/dist/agent/loop.js +16 -1
package/dist/config.js +4 -2
package/dist/hooks.js +64 -0
package/dist/index.js +331 -77
package/dist/notify.js +42 -0
package/dist/org/planner.js +19 -0
package/dist/plugins/plugins.js +14 -0
package/dist/providers/anthropic.js +21 -11
package/dist/search/semindex.js +62 -11
package/dist/session/store.js +14 -0
package/dist/tools/computer.js +156 -16
package/dist/tools/todo.js +51 -0
package/dist/tools/web.js +97 -0
package/dist/tui/App.js +55 -7
package/dist/tui/InputBox.js +2 -2
package/dist/vision.js +52 -3
package/package.json +3 -2
package/plugins/browser/.hara-plugin/plugin.json +9 -0
package/plugins/browser/skills/web/SKILL.md +27 -0
package/plugins/chrome/.hara-plugin/plugin.json +9 -0
package/plugins/chrome/skills/chrome/SKILL.md +26 -0

package/dist/tui/App.js CHANGED Viewed

@@ -13,6 +13,7 @@ import { InputBox } from "./InputBox.js";
 import { activity } from "../activity.js";
 import { ctxPctFor } from "../statusbar.js";
 import { accent } from "./theme.js";
+import { renderMarkdown } from "../md.js";
 let _id = 0;
 const nid = () => ++_id;
 const stripAnsi = (s) => s.replace(/\x1b\[[0-9;]*m/g, "");
@@ -21,7 +22,7 @@ function Block({ item, open }) {
         case "user":
             return (_jsxs(Box, { marginTop: 1, children: [_jsx(Text, { color: "cyan", children: "\u203A " }), _jsx(Text, { children: item.text })] }));
         case "assistant":
-            return _jsx(Text, { children: item.text });
+            return _jsx(Text, { children: renderMarkdown(item.text) }); // headers/bold/inline-code/bullets + verbatim fences
         case "reasoning": {
             // fixed-height window: show the last 5 lines while thinking; ctrl-r toggles the full text.
             const lines = item.text.replace(/\n+$/, "").split("\n");
@@ -71,6 +72,9 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
     const [promptSel, setPromptSel] = useState(0);
     const [reasoningOpen, setReasoningOpen] = useState(false);
     const ctrlRef = useRef(null);
+    const queueRef = useRef([]); // type-ahead: FIFO of messages entered while working
+    const [pool, setPool] = useState([]); // type-ahead pool: queued message lines, shown above the input
+    const drainingRef = useRef(false); // idempotency guard so the drain effect can't double-send one item
     const currentRef = useRef([]);
     currentRef.current = current;
     const statusRef = useRef(status);
@@ -88,10 +92,29 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
             return [...cur, { id: nid(), kind, text }];
         });
     }, []);
+    // Type-ahead steering: hand the runner everything queued while the turn ran, showing each message
+    // inline (as a user block) at the point it gets folded into the conversation. Drained mid-turn so an
+    // addition reaches the model on its next call; whatever's still queued at turn end is the effect below.
+    const drainQueue = useCallback(() => {
+        if (!queueRef.current.length)
+            return [];
+        const batch = queueRef.current;
+        queueRef.current = [];
+        setPool([]);
+        for (const b of batch)
+            pushCurrent("user", b.line.trim() || "🖼 (image)");
+        return batch;
+    }, [pushCurrent]);
     const handleSubmit = useCallback(async (line, images) => {
         const t = line.trim();
-        if ((!t && !images?.length) || working || prompt)
-            return; // allow image-only turns
+        if ((!t && !images?.length) || prompt)
+            return; // nothing to send, or a choice is pending
+        if (working) {
+            // type-ahead: hold the message in the pool; the runner may pull it mid-turn via drainQueue, and whatever is still pooled is sent together when the turn ends
+            queueRef.current.push({ line, images });
+            setPool(queueRef.current.map((q) => q.line.trim() || "🖼 (image)"));
+            return;
+        }
         setHistory((h) => [...h, { id: nid(), kind: "user", text: t }]); // t already carries any [Image #N] tokens
         const ctrl = new AbortController();
         ctrlRef.current = ctrl;
@@ -117,7 +140,7 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
         const selectFn = (title, options) => openPrompt(title, options);
         const setApprovalFn = (m) => setStatus((s) => ({ ...s, approval: m }));
         try {
-            await onSubmit(t, { sink, confirm: confirmFn, select: selectFn, setApproval: setApprovalFn, signal: ctrl.signal, exit, approval: statusRef.current.approval }, images);
+            await onSubmit(t, { sink, confirm: confirmFn, select: selectFn, setApproval: setApprovalFn, signal: ctrl.signal, exit, approval: statusRef.current.approval, drainQueue }, images);
         }
         catch (e) {
             pushCurrent("notice", `error: ${e instanceof Error ? e.message : String(e)}`);
@@ -129,7 +152,22 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
         setCurrent([]);
         setWorking(false);
         ctrlRef.current = null;
-    }, [working, prompt, onSubmit, pushCurrent, model, exit]);
+    }, [working, prompt, onSubmit, pushCurrent, model, exit, drainQueue]);
+    // Drain the type-ahead pool: when the turn finishes (working → false) and nothing awaits a choice, COALESCE
+    // every pooled message into ONE turn and send it — additions/clarifications go to the agent together, in order.
+    useEffect(() => {
+        if (working || prompt || drainingRef.current || !queueRef.current.length)
+            return;
+        drainingRef.current = true;
+        const batch = queueRef.current;
+        queueRef.current = [];
+        setPool([]);
+        const line = batch.map((b) => b.line).join("\n\n");
+        const images = batch.flatMap((b) => b.images ?? []);
+        void Promise.resolve(handleSubmit(line, images.length ? images : undefined)).finally(() => {
+            drainingRef.current = false;
+        });
+    }, [working, prompt, handleSubmit]);
     useInput((input, key) => {
         if (prompt) {
             const opts = prompt.options;
@@ -145,6 +183,10 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
                 prompt.resolve(opts[opts.length - 1].value); // last option = cancel/no
                 setPrompt(null);
             }
+            else if (/^[1-9]$/.test(input) && Number(input) <= opts.length) {
+                prompt.resolve(opts[Number(input) - 1].value); // type a number to pick directly
+                setPrompt(null);
+            }
             else if (input) {
                 const hit = opts.find((o) => o.key && o.key === input.toLowerCase());
                 if (hit) {
@@ -156,10 +198,16 @@ export function App({ initialStatus, model, cwd, header, onSubmit, cycleApproval
         }
         if (key.ctrl && input === "r")
             return setReasoningOpen((x) => !x);
-        if (key.escape && working)
+        if (key.escape && working) {
+            // Esc = stop everything: abort the turn AND drop any type-ahead (a stopped turn shouldn't fire queued msgs)
+            if (queueRef.current.length) {
+                queueRef.current = [];
+                setPool([]);
+            }
             ctrlRef.current?.abort();
+        }
         else if (key.tab && key.shift && cycleApproval)
             setStatus((s) => ({ ...s, approval: cycleApproval(s.approval) }));
     });
-    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(Static, { items: header ? [{ id: -1, kind: "notice", text: "" }, ...history] : history, children: (item) => (item.id === -1 ? _jsx(HeaderCard, { ...header }, "hdr") : _jsx(Block, { item: item }, item.id)) }), current.map((item) => (_jsx(Block, { item: item, open: reasoningOpen }, item.id))), working && !prompt && _jsx(Working, {}), prompt && (_jsxs(Box, { flexDirection: "column", marginTop: 1, children: [_jsx(Text, { color: "yellow", children: `  ${stripAnsi(prompt.title)}` }), prompt.options.map((o, i) => (_jsx(Text, { color: i === promptSel ? "cyan" : undefined, bold: i === promptSel, children: (i === promptSel ? " ❯ " : "   ") + o.label }, i)))] })), _jsx(InputBox, { status: status, cwd: cwd, isActive: !working && !prompt, onSubmit: handleSubmit, onClipboardImage: onClipboardImage })] }));
+    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(Static, { items: header ? [{ id: -1, kind: "notice", text: "" }, ...history] : history, children: (item) => (item.id === -1 ? _jsx(HeaderCard, { ...header }, "hdr") : _jsx(Block, { item: item }, item.id)) }), current.map((item) => (_jsx(Block, { item: item, open: reasoningOpen }, item.id))), working && !prompt && _jsx(Working, {}), prompt && (_jsxs(Box, { flexDirection: "column", marginTop: 1, children: [_jsx(Text, { color: "yellow", children: `  ${stripAnsi(prompt.title)}` }), prompt.options.map((o, i) => (_jsx(Text, { color: i === promptSel ? "cyan" : undefined, bold: i === promptSel, children: (i === promptSel ? " ❯ " : "   ") + `${i + 1}. ` + o.label }, i))), _jsx(Text, { dimColor: true, children: `   ↑↓ or 1–${prompt.options.length} to choose · Enter · Esc cancels` })] })), pool.length > 0 && !prompt && (_jsx(Box, { flexDirection: "column", children: pool.map((l, i) => (_jsx(Text, { color: accent(), children: `  › ${l.length > 72 ? l.slice(0, 72) + "…" : l}` }, i))) })), _jsx(InputBox, { status: status, cwd: cwd, isActive: !prompt, working: working, queued: pool.length, onSubmit: handleSubmit, onClipboardImage: onClipboardImage })] }));
 }

package/dist/tui/InputBox.js CHANGED Viewed

@@ -84,7 +84,7 @@ function InputLine({ value, cursor }) {
     return _jsx(Text, { children: nodes });
 }
 /** Top border (session) + prompt line + bottom border (usage) + ModeBar, with an @path popup. */
-export function InputBox({ status, cwd, width, onSubmit, onClipboardImage, isActive = true, placeholder = "Type a task · /help · @file · Ctrl+V paste image · shift+tab mode · Esc interrupts", }) {
+export function InputBox({ status, cwd, width, onSubmit, onClipboardImage, isActive = true, working = false, queued = 0, placeholder = "Type a task · /help · @file · Ctrl+V paste image · shift+tab mode · Esc interrupts", }) {
     const { stdout } = useStdout();
     const w = width ?? stdout?.columns ?? 80;
     const [value, setValue] = useState("");
@@ -204,5 +204,5 @@ export function InputBox({ status, cwd, width, onSubmit, onClipboardImage, isAct
             set(value.slice(0, cursor) + input + value.slice(cursor), cursor + input.length);
         }
     }, { isActive });
-    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(TopBorder, { name: status.sessionName || "session", width: w }), _jsxs(Box, { children: [_jsx(Text, { color: "cyan", children: "› " }), value.length === 0 ? (_jsxs(Text, { children: [_jsx(Text, { inverse: true, children: " " }), _jsx(Text, { dimColor: true, children: placeholder })] })) : (_jsx(InputLine, { value: value, cursor: cursor }))] }), _jsx(BottomBorder, { s: status, width: w }), popupOpen ? _jsx(MentionPopup, { items: candidates, selected: selIdx, query: mention.query }) : null, _jsx(ModeBar, { approval: status.approval })] }));
+    return (_jsxs(Box, { flexDirection: "column", children: [_jsx(TopBorder, { name: status.sessionName || "session", width: w }), _jsxs(Box, { children: [_jsx(Text, { color: "cyan", children: "› " }), value.length === 0 ? (_jsxs(Text, { children: [_jsx(Text, { inverse: true, children: " " }), _jsx(Text, { dimColor: true, children: placeholder })] })) : (_jsx(InputLine, { value: value, cursor: cursor }))] }), _jsx(BottomBorder, { s: status, width: w }), working ? _jsx(Text, { dimColor: true, children: `  ⌨ working — Enter queues your message${queued ? ` · ${queued} queued` : ""} · Esc interrupts` }) : null, popupOpen ? _jsx(MentionPopup, { items: candidates, selected: selIdx, query: mention.query }) : null, _jsx(ModeBar, { approval: status.approval })] }));
 }

package/dist/vision.js CHANGED Viewed

@@ -65,12 +65,61 @@ export const DESCRIBE_SYSTEM = [
     "4. Quote any error or warning messages exactly.",
     "5. Be thorough and factual; do not speculate beyond what is visible.",
 ].join("\n");
+// Screenshot variant — tuned for driving the desktop (RPA) rather than transcription. A text-only main
+// model can't see, so it needs *actionable* output: where things are, so it can issue clicks. The lines below are joined with "\n" into a single system-prompt string.
+export const SCREENSHOT_SYSTEM = [
+    "You are the eyes of an assistant operating this computer; it cannot see the screen and acts only on your",
+    "words. Describe the screenshot so it can ACT. Prioritise, in order:",
+    "1. INTERACTIVE elements — buttons, links, text fields, checkboxes, menus, tabs, icons — each with its",
+    "   visible label and an approximate location: a region (e.g. top-right) AND a rough pixel x,y if you can.",
+    "2. The currently focused/active element or selection, and any open dialog/modal/popup.",
+    "3. Errors, warnings, and key visible text/headings — quote them exactly.",
+    "4. One line on what app/screen this appears to be.",
+    "Positions guide clicks, so always estimate them. Be concise and factual; never invent elements.",
+].join("\n");
+// Grounding — ask a vision model WHERE a UI element is (for accurate RPA clicking), as resolution-independent
+// fractions so it works regardless of Retina/DPI scaling. The reply format requested here ({"x", "y"} in per-mille, -1/-1 when not visible) is what parseLocate consumes.
+export const LOCATE_SYSTEM = [
+    "You are given a screenshot. The user names ONE UI element (button, field, icon, menu item, link).",
+    "Return ONLY its CENTER as JSON: {\"x\": <0-1000>, \"y\": <0-1000>}, where x is the position as per-mille of",
+    "the image WIDTH (0=left, 1000=right) and y as per-mille of the HEIGHT (0=top, 1000=bottom).",
+    "If the element is not visible, return {\"x\": -1, \"y\": -1}. Output ONLY the JSON, nothing else.",
+].join("\n");
+/**
+ * Parse a grounding reply → {x,y} as 0..1 fractions (accepts per-mille / percent / fraction), or null.
+ *
+ * @param {string} text - Reply text to scan. The keyed form `"x": <n> ... "y": <n>` is tried first;
+ *   failing that, the first two numbers separated by a comma or whitespace are used.
+ * @returns {{x: number, y: number} | null} Both coordinates clamped to [0, 1], or null when no pair
+ *   is found or either value is negative (the "not visible" reply uses -1/-1).
+ *
+ * NOTE(review): the scale is inferred per axis from magnitude alone (> 100 per-mille, > 1.5 percent,
+ * otherwise fraction). A per-mille value ≤ 100, such as x=50, is therefore read as a percent and
+ * yields 0.5 rather than 0.05 — confirm this is acceptable for targets near the left/top edge.
+ */
+export function parseLocate(text) {
+    const m = text.match(/"x"\s*:\s*(-?\d+(?:\.\d+)?)[\s,}]+.*?"y"\s*:\s*(-?\d+(?:\.\d+)?)/s) || text.match(/(-?\d+(?:\.\d+)?)\s*[,\s]\s*(-?\d+(?:\.\d+)?)/); // keyed form first (the s flag lets it span newlines), then any bare number pair
+    if (!m)
+        return null;
+    let x = Number(m[1]);
+    let y = Number(m[2]);
+    if (x < 0 || y < 0 || Number.isNaN(x) || Number.isNaN(y))
+        return null; // not found / unparseable
+    const norm = (v) => (v > 100 ? v / 1000 : v > 1.5 ? v / 100 : v); // per-mille | percent | fraction → 0..1
+    x = Math.min(1, Math.max(0, norm(x))); // clamp into [0, 1]
+    y = Math.min(1, Math.max(0, norm(y)));
+    return { x, y };
+}
+/**
+ * Send a screenshot to a (grounding-capable) vision model and get the target's center as 0..1 fractions.
+ *
+ * @param provider - Object exposing `turn(...)`; it is called once with LOCATE_SYSTEM as the system
+ *   prompt, a single user message, and an empty tool list.
+ * @param image - The screenshot; forwarded as the only entry of the user message's `images`.
+ * @param target - Description of the UI element to find; interpolated into the user message text.
+ * @param [opts] - Optional settings; only `opts.signal` is read, and it is passed through to the provider.
+ * @returns A promise of parseLocate's result for the reply text, or null when the turn reports
+ *   `stop === "error"`.
+ */
+export async function locateImage(provider, image, target, opts = {}) {
+    const r = await provider.turn({
+        system: LOCATE_SYSTEM,
+        history: [{ role: "user", content: `Locate this element: ${target}`, images: [image] }],
+        tools: [],
+        onText: () => { }, // streamed text is ignored; only the final r.text is parsed
+        signal: opts.signal,
+    });
+    if (r.stop === "error")
+        return null;
+    return parseLocate(r.text);
+}
 const PROMPT = "Describe the attached image(s) per your instructions.";
-/** Send images to the vision provider and return its textual description. Throws on a provider error. */
+/** Send images to the vision provider and return its textual description. Throws on a provider error.
+ *  `system` overrides the default prompt (e.g. SCREENSHOT_SYSTEM); `hint` focuses it on a specific goal. */
 export async function describeImages(provider, images, opts = {}) {
+    const content = opts.hint ? `${PROMPT}\nFocus especially on: ${opts.hint}` : PROMPT;
     const r = await provider.turn({
-        system: DESCRIBE_SYSTEM,
-        history: [{ role: "user", content: PROMPT, images }],
+        system: opts.system ?? DESCRIBE_SYSTEM,
+        history: [{ role: "user", content, images }],
         tools: [],
         onText: () => { },
         signal: opts.signal,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@nanhara/hara",
-  "version": "0.33.0",
+  "version": "0.53.0",
   "description": "hara — a coding agent CLI that runs like an engineering org.",
   "bin": {
     "hara": "dist/index.js"
@@ -11,7 +11,8 @@
     "README.md",
     "CHANGELOG.md",
     "LICENSE",
-    "CLA.md"
+    "CLA.md",
+    "plugins"
   ],
   "keywords": [
     "ai",

package/plugins/browser/.hara-plugin/plugin.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "browser",
+  "version": "0.1.0",
+  "description": "Reliable web automation for hara via the Playwright MCP — acts on the DOM/accessibility tree (selectors, auto-wait), not pixels. navigate / click / type / fill / snapshot.",
+  "skills": ["skills"],
+  "mcpServers": {
+    "browser": { "command": "npx", "args": ["-y", "@playwright/mcp@latest"] }
+  }
+}

package/plugins/browser/skills/web/SKILL.md ADDED Viewed

@@ -0,0 +1,27 @@
+---
+name: web-automation
+description: Operate web pages reliably — navigate, click, fill forms, log in, extract — via the Playwright MCP. Acts on the DOM/accessibility tree by selector/role (deterministic, auto-waiting), NOT screenshots or pixel coordinates. Far more reliable than desktop screen control.
+when_to_use: when the user wants to do anything on a website — open a page, click, fill/submit a form, log in, scrape data, automate a web flow.
+---
+# Web automation (Playwright MCP)
+Reliable browser tools are available as `mcp__browser__*` (navigate, snapshot, click, type, fill_form,
+select_option, evaluate, …). They act on the page's **accessibility tree by element ref/role/text** — not
+screenshots or pixel coordinates — so they're deterministic and auto-wait for elements. This is the reliable
+counterpart to the fragile desktop `computer` tool: prefer it for anything on the web.
+## Workflow
+1. `browser_navigate` to the URL.
+2. `browser_snapshot` — read the accessibility tree (elements + their `ref`s). This is your "eyes": use the
+   refs to act precisely. Prefer it over a screenshot.
+3. Act by ref/role/text: `browser_click`, `browser_type`, `browser_fill_form`, `browser_select_option`.
+4. `browser_snapshot` again to verify before the next step.
+## Notes
+- First run downloads a browser once: `npx playwright install chromium`.
+- The Playwright MCP uses its **own** browser (no logins). For tasks needing your **real logged-in Chrome**, use
+  `chrome-devtools-mcp` instead (drives your actual Chrome via CDP) — swap the mcpServers command to
+  `npx chrome-devtools-mcp@latest`. (This is what openclaw/cc-haha use.)
+- **Confirm before irreversible actions** — purchases, posting, sending messages, deleting. Verify the page/state
+  with a snapshot first.

package/plugins/chrome/.hara-plugin/plugin.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "name": "chrome",
+  "version": "0.1.0",
+  "description": "Drive a real, persistent-login Chrome from hara via chrome-devtools-mcp (CDP) — for web tasks on sites you're already signed into (logins persist across runs). Alternative to the `browser` plugin's isolated Playwright browser — enable one, not both.",
+  "skills": ["skills"],
+  "mcpServers": {
+    "chrome": { "command": "npx", "args": ["-y", "chrome-devtools-mcp@latest"] }
+  }
+}

package/plugins/chrome/skills/chrome/SKILL.md ADDED Viewed

@@ -0,0 +1,26 @@
+---
+name: chrome-control
+description: Operate a REAL Chrome (with your persistent logins) for web tasks on signed-in sites — via chrome-devtools-mcp (Chrome DevTools Protocol). Use instead of the isolated Playwright `browser` plugin when the task needs your existing accounts/sessions.
+when_to_use: when a web task must run on a site you're logged into (your dashboards, accounts, web apps) rather than a fresh anonymous browser.
+---
+# Chrome (real, logged-in) via chrome-devtools-mcp
+Tools appear as `mcp__chrome__*` (navigate, click, fill, snapshot, network, performance…). Same
+DOM/accessibility-tree reliability as the `browser` plugin, but it drives a **real Chrome with a persistent
+profile** — log into a site once and the session is remembered across runs.
+## Modes
+- **Persistent profile (default):** `npx chrome-devtools-mcp@latest` launches Chrome with a saved profile at
+  `~/.cache/chrome-devtools-mcp/chrome-profile`. Log in once; it persists. Good default.
+- **Attach to YOUR running Chrome:** launch Chrome with `--remote-debugging-port=9222`, then set the MCP command
+  to `npx chrome-devtools-mcp@latest --browserUrl http://127.0.0.1:9222` — hara then drives your actual browser
+  and all its logins.
+## Enable (alternative to `browser`, not both)
+Running two browser MCPs at once is confusing. To switch from the default Playwright `browser`:
+`hara plugin add file:<repo>/plugins/chrome && hara plugin disable browser`.
+## Caution
+This controls a **real** browser session. Confirm before destructive/irreversible actions (purchases, posting,
+sending, deleting); take a snapshot to verify the page/state first.