npm - 0agent - Versions diffs - 1.0.55 → 1.0.57 - Mend

0agent 1.0.55 → 1.0.57

This diff shows the changes between two publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (2)

package/dist/daemon.mjs +31 -9
package/package.json +1 -1

package/dist/daemon.mjs CHANGED Viewed

@@ -2135,10 +2135,10 @@ var init_BrowserCapability = __esm({
     "use strict";
     BrowserCapability = class {
       name = "browser_open";
-      description = "Open a URL in the system browser or extract page content. Use when scrape_url fails on JS-heavy pages.";
+      description = "Headless browser for scraping JS-heavy pages. NOT for user-facing browser automation.";
       toolDefinition = {
         name: "browser_open",
-        description: `Open or read a URL using the system browser. Use action="open" to launch the URL visibly in the user's default browser. Use action="read" (default) to extract page content headlessly. Use when scrape_url fails on JS-heavy pages.`,
+        description: `Headless browser \u2014 ONLY for reading/scraping page content when scrape_url fails on JS-heavy pages. action="read" (default): extract text headlessly (invisible, no real browser window opened). NEVER use this when the task involves the user's real browser or visible UI \u2014 use gui_automation with open_url instead. Do NOT use alongside gui_automation for the same URL \u2014 pick one.`,
         input_schema: {
           type: "object",
           properties: {
@@ -2599,13 +2599,13 @@ var init_GUICapability = __esm({
       description = "Automate desktop GUI \u2014 click, type, screenshot, hotkeys, find text on screen.";
       toolDefinition = {
         name: "gui_automation",
-        description: 'Automate desktop GUI interactions. Take screenshots to see the current screen state, click on buttons/links/fields, type text, press keyboard shortcuts, scroll, open apps. IMPORTANT: Limit screenshots to at most 3 per task \u2014 avoid re-screenshotting if you already know the layout. Prefer targeted actions (click, find_and_click, hotkey) over repeated screenshots. Use get_cursor_pos to check cursor position without a full screenshot. To open a website, ALWAYS use action="open_url" \u2014 never open_app + new tab, which creates duplicate windows.',
+        description: "Desktop GUI automation \u2014 ONLY for tasks that explicitly require controlling the screen. DO NOT use for coding, research, file edits, or tasks that do not need the desktop UI. DO NOT use alongside browser_open for the same URL \u2014 pick one tool and finish the task in it. wait: pause N seconds for UI/page to load \u2014 use after every navigation or click that triggers a page load. screenshot: only when you cannot proceed without seeing the screen. Max 2 per task. open_url: opens in existing browser tab, never duplicates windows.",
         input_schema: {
           type: "object",
           properties: {
             action: {
               type: "string",
-              description: '"screenshot" | "click" | "double_click" | "right_click" | "move" | "type" | "hotkey" | "scroll" | "drag" | "find_and_click" | "get_screen_size" | "get_cursor_pos" | "open_url" | "open_app"'
+              description: '"screenshot" | "click" | "double_click" | "right_click" | "move" | "type" | "hotkey" | "scroll" | "drag" | "find_and_click" | "get_screen_size" | "get_cursor_pos" | "wait" | "open_url" | "open_app"'
             },
             x: { type: "number", description: "X coordinate (pixels from left)" },
             y: { type: "number", description: "Y coordinate (pixels from top)" },
@@ -2617,6 +2617,7 @@ var init_GUICapability = __esm({
             amount: { type: "number", description: "Scroll clicks (default 3)" },
             app: { type: "string", description: 'App name to open e.g. "Safari", "Terminal", "Chrome"' },
             url: { type: "string", description: 'URL to open e.g. "https://example.com" (use with open_url)' },
+            seconds: { type: "number", description: "Seconds to wait (use with wait action, default 2)" },
             interval: { type: "number", description: "Seconds to wait between actions (default 0.05)" },
             duration: { type: "number", description: "Seconds for mouse movement animation (default 0.2)" }
           },
@@ -2628,7 +2629,7 @@ var init_GUICapability = __esm({
         const start = Date.now();
         const script = this._buildScript(action, input);
         if (!script) {
-          return { success: false, output: `Unknown GUI action: "${action}". Valid: screenshot, click, double_click, right_click, move, type, hotkey, scroll, drag, find_and_click, get_screen_size, get_cursor_pos, open_url, open_app`, duration_ms: 0 };
+          return { success: false, output: `Unknown GUI action: "${action}". Valid: screenshot, click, double_click, right_click, move, type, hotkey, scroll, drag, find_and_click, get_screen_size, get_cursor_pos, wait, open_url, open_app`, duration_ms: 0 };
         }
         if (signal?.aborted) {
           return { success: false, output: "Cancelled.", duration_ms: 0 };
@@ -2718,6 +2719,7 @@ var init_GUICapability = __esm({
         const amount = input.amount != null ? Number(input.amount) : 3;
         const app = input.app != null ? String(input.app) : "";
         const url = input.url != null ? String(input.url) : "";
+        const seconds = input.seconds != null ? Number(input.seconds) : 2;
         const interval = input.interval != null ? Number(input.interval) : 0.05;
         const duration = input.duration != null ? Number(input.duration) : 0.2;
         const header = `
@@ -2737,6 +2739,11 @@ print(f"Screen size: {w} x {h}")
             return header + `
 x, y = pyautogui.position()
 print(f"Cursor position: ({x}, {y})")
+`;
+          case "wait":
+            return header + `
+time.sleep(${seconds})
+print(f"Waited ${seconds}s")
 `;
           case "screenshot": {
             return header + `
@@ -3497,12 +3504,27 @@ content = element.text if element else page.get_all_text()` : `content = page.ge
           `- Use relative paths from the working directory`,
           `- Be concise in your final response: state what was done and where to find it`,
           ``,
-          `GUI Automation (gui_automation tool):`,
-          `- ALWAYS call gui_automation({action:"screenshot"}) first to see what is on screen`,
-          `- Use the OCR output to find element coordinates before clicking`,
-          `- After clicking or typing, take another screenshot to confirm the result`,
+          `\u2550\u2550\u2550 EXECUTION DISCIPLINE \u2014 follow strictly \u2550\u2550\u2550`,
+          `- SEQUENTIAL: complete each step fully before starting the next. Never start step 2 while step 1 is still in progress.`,
+          `- NO DUPLICATION: before any action, review the conversation above. If you already did it (opened a URL, clicked a button, sent a message), DO NOT do it again.`,
+          `- ONE BROWSER ONLY: never use both gui_automation and browser_open for the same task.`,
+          `  \xB7 Use gui_automation (open_url) when the task involves the user's real visible browser.`,
+          `  \xB7 Use browser_open ONLY for silent scraping/content-extraction where no visible browser is needed.`,
+          `  \xB7 Never open the same URL in both. Pick one and finish the task in it.`,
+          `- WAIT FOR LOADS: after every navigation, click, or app open \u2014 wait for the UI to fully load before the next action.`,
+          `  \xB7 Use gui_automation({action:"wait", seconds:2}) after opening URLs or clicking buttons that trigger navigation.`,
+          `  \xB7 Web apps (WhatsApp, Gmail, etc.) need 3\u20135 seconds. Native apps need 1\u20132 seconds.`,
+          `  \xB7 If an action produced no visible change, wait and try once more \u2014 do not spam the same action.`,
+          `\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550`,
+          ``,
+          `GUI Automation (gui_automation tool) \u2014 ONLY use when the task explicitly requires controlling the desktop UI:`,
+          `- DO NOT take screenshots for general tasks, coding, research, or anything that doesn't need the screen`,
+          `- Only screenshot when you genuinely cannot proceed without seeing the current screen state`,
+          `- Prefer find_and_click, hotkey, open_url, and type over repeated screenshots`,
+          `- Max 2 screenshots per task \u2014 if you've already seen the screen, act on that knowledge`,
           `- Use find_and_click to click on text by name rather than guessing coordinates`,
           `- Use hotkey for keyboard shortcuts: "cmd+c", "ctrl+v", "alt+tab", "cmd+space"`,
+          `- To open a website: use open_url \u2014 it reuses the existing browser tab`,
           ...hasMemory ? [
             ``,
             `Memory (CRITICAL \u2014 write EVERYTHING you learn):`,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "0agent",
-  "version": "1.0.55",
+  "version": "1.0.57",
   "description": "A persistent, learning AI agent that runs on your machine. An agent that learns.",
   "private": false,
   "license": "Apache-2.0",