0agent 1.0.55 → 1.0.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/daemon.mjs +31 -9
  2. package/package.json +1 -1
package/dist/daemon.mjs CHANGED
@@ -2135,10 +2135,10 @@ var init_BrowserCapability = __esm({
2135
2135
  "use strict";
2136
2136
  BrowserCapability = class {
2137
2137
  name = "browser_open";
2138
- description = "Open a URL in the system browser or extract page content. Use when scrape_url fails on JS-heavy pages.";
2138
+ description = "Headless browser for scraping JS-heavy pages. NOT for user-facing browser automation.";
2139
2139
  toolDefinition = {
2140
2140
  name: "browser_open",
2141
- description: `Open or read a URL using the system browser. Use action="open" to launch the URL visibly in the user's default browser. Use action="read" (default) to extract page content headlessly. Use when scrape_url fails on JS-heavy pages.`,
2141
+ description: `Headless browser \u2014 ONLY for reading/scraping page content when scrape_url fails on JS-heavy pages. action="read" (default): extract text headlessly (invisible, no real browser window opened). NEVER use this when the task involves the user's real browser or visible UI \u2014 use gui_automation with open_url instead. Do NOT use alongside gui_automation for the same URL \u2014 pick one.`,
2142
2142
  input_schema: {
2143
2143
  type: "object",
2144
2144
  properties: {
@@ -2599,13 +2599,13 @@ var init_GUICapability = __esm({
2599
2599
  description = "Automate desktop GUI \u2014 click, type, screenshot, hotkeys, find text on screen.";
2600
2600
  toolDefinition = {
2601
2601
  name: "gui_automation",
2602
- description: 'Automate desktop GUI interactions. Take screenshots to see the current screen state, click on buttons/links/fields, type text, press keyboard shortcuts, scroll, open apps. IMPORTANT: Limit screenshots to at most 3 per task \u2014 avoid re-screenshotting if you already know the layout. Prefer targeted actions (click, find_and_click, hotkey) over repeated screenshots. Use get_cursor_pos to check cursor position without a full screenshot. To open a website, ALWAYS use action="open_url" \u2014 never open_app + new tab, which creates duplicate windows.',
2602
+ description: "Desktop GUI automation \u2014 ONLY for tasks that explicitly require controlling the screen. DO NOT use for coding, research, file edits, or tasks that do not need the desktop UI. DO NOT use alongside browser_open for the same URL \u2014 pick one tool and finish the task in it. wait: pause N seconds for UI/page to load \u2014 use after every navigation or click that triggers a page load. screenshot: only when you cannot proceed without seeing the screen. Max 2 per task. open_url: opens in existing browser tab, never duplicates windows.",
2603
2603
  input_schema: {
2604
2604
  type: "object",
2605
2605
  properties: {
2606
2606
  action: {
2607
2607
  type: "string",
2608
- description: '"screenshot" | "click" | "double_click" | "right_click" | "move" | "type" | "hotkey" | "scroll" | "drag" | "find_and_click" | "get_screen_size" | "get_cursor_pos" | "open_url" | "open_app"'
2608
+ description: '"screenshot" | "click" | "double_click" | "right_click" | "move" | "type" | "hotkey" | "scroll" | "drag" | "find_and_click" | "get_screen_size" | "get_cursor_pos" | "wait" | "open_url" | "open_app"'
2609
2609
  },
2610
2610
  x: { type: "number", description: "X coordinate (pixels from left)" },
2611
2611
  y: { type: "number", description: "Y coordinate (pixels from top)" },
@@ -2617,6 +2617,7 @@ var init_GUICapability = __esm({
2617
2617
  amount: { type: "number", description: "Scroll clicks (default 3)" },
2618
2618
  app: { type: "string", description: 'App name to open e.g. "Safari", "Terminal", "Chrome"' },
2619
2619
  url: { type: "string", description: 'URL to open e.g. "https://example.com" (use with open_url)' },
2620
+ seconds: { type: "number", description: "Seconds to wait (use with wait action, default 2)" },
2620
2621
  interval: { type: "number", description: "Seconds to wait between actions (default 0.05)" },
2621
2622
  duration: { type: "number", description: "Seconds for mouse movement animation (default 0.2)" }
2622
2623
  },
@@ -2628,7 +2629,7 @@ var init_GUICapability = __esm({
2628
2629
  const start = Date.now();
2629
2630
  const script = this._buildScript(action, input);
2630
2631
  if (!script) {
2631
- return { success: false, output: `Unknown GUI action: "${action}". Valid: screenshot, click, double_click, right_click, move, type, hotkey, scroll, drag, find_and_click, get_screen_size, get_cursor_pos, open_url, open_app`, duration_ms: 0 };
2632
+ return { success: false, output: `Unknown GUI action: "${action}". Valid: screenshot, click, double_click, right_click, move, type, hotkey, scroll, drag, find_and_click, get_screen_size, get_cursor_pos, wait, open_url, open_app`, duration_ms: 0 };
2632
2633
  }
2633
2634
  if (signal?.aborted) {
2634
2635
  return { success: false, output: "Cancelled.", duration_ms: 0 };
@@ -2718,6 +2719,7 @@ var init_GUICapability = __esm({
2718
2719
  const amount = input.amount != null ? Number(input.amount) : 3;
2719
2720
  const app = input.app != null ? String(input.app) : "";
2720
2721
  const url = input.url != null ? String(input.url) : "";
2722
+ const seconds = input.seconds != null ? Number(input.seconds) : 2;
2721
2723
  const interval = input.interval != null ? Number(input.interval) : 0.05;
2722
2724
  const duration = input.duration != null ? Number(input.duration) : 0.2;
2723
2725
  const header = `
@@ -2737,6 +2739,11 @@ print(f"Screen size: {w} x {h}")
2737
2739
  return header + `
2738
2740
  x, y = pyautogui.position()
2739
2741
  print(f"Cursor position: ({x}, {y})")
2742
+ `;
2743
+ case "wait":
2744
+ return header + `
2745
+ time.sleep(${seconds})
2746
+ print(f"Waited ${seconds}s")
2740
2747
  `;
2741
2748
  case "screenshot": {
2742
2749
  return header + `
@@ -3497,12 +3504,27 @@ content = element.text if element else page.get_all_text()` : `content = page.ge
3497
3504
  `- Use relative paths from the working directory`,
3498
3505
  `- Be concise in your final response: state what was done and where to find it`,
3499
3506
  ``,
3500
- `GUI Automation (gui_automation tool):`,
3501
- `- ALWAYS call gui_automation({action:"screenshot"}) first to see what is on screen`,
3502
- `- Use the OCR output to find element coordinates before clicking`,
3503
- `- After clicking or typing, take another screenshot to confirm the result`,
3507
+ `\u2550\u2550\u2550 EXECUTION DISCIPLINE \u2014 follow strictly \u2550\u2550\u2550`,
3508
+ `- SEQUENTIAL: complete each step fully before starting the next. Never start step 2 while step 1 is still in progress.`,
3509
+ `- NO DUPLICATION: before any action, review the conversation above. If you already did it (opened a URL, clicked a button, sent a message), DO NOT do it again.`,
3510
+ `- ONE BROWSER ONLY: never use both gui_automation and browser_open for the same task.`,
3511
+ ` \xB7 Use gui_automation (open_url) when the task involves the user's real visible browser.`,
3512
+ ` \xB7 Use browser_open ONLY for silent scraping/content-extraction where no visible browser is needed.`,
3513
+ ` \xB7 Never open the same URL in both. Pick one and finish the task in it.`,
3514
+ `- WAIT FOR LOADS: after every navigation, click, or app open \u2014 wait for the UI to fully load before the next action.`,
3515
+ ` \xB7 Use gui_automation({action:"wait", seconds:2}) after opening URLs or clicking buttons that trigger navigation.`,
3516
+ ` \xB7 Web apps (WhatsApp, Gmail, etc.) need 3\u20135 seconds. Native apps need 1\u20132 seconds.`,
3517
+ ` \xB7 If an action produced no visible change, wait and try once more \u2014 do not spam the same action.`,
3518
+ `\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550`,
3519
+ ``,
3520
+ `GUI Automation (gui_automation tool) \u2014 ONLY use when the task explicitly requires controlling the desktop UI:`,
3521
+ `- DO NOT take screenshots for general tasks, coding, research, or anything that doesn't need the screen`,
3522
+ `- Only screenshot when you genuinely cannot proceed without seeing the current screen state`,
3523
+ `- Prefer find_and_click, hotkey, open_url, and type over repeated screenshots`,
3524
+ `- Max 2 screenshots per task \u2014 if you've already seen the screen, act on that knowledge`,
3504
3525
  `- Use find_and_click to click on text by name rather than guessing coordinates`,
3505
3526
  `- Use hotkey for keyboard shortcuts: "cmd+c", "ctrl+v", "alt+tab", "cmd+space"`,
3527
+ `- To open a website: use open_url \u2014 it reuses the existing browser tab`,
3506
3528
  ...hasMemory ? [
3507
3529
  ``,
3508
3530
  `Memory (CRITICAL \u2014 write EVERYTHING you learn):`,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "0agent",
3
- "version": "1.0.55",
3
+ "version": "1.0.57",
4
4
  "description": "A persistent, learning AI agent that runs on your machine. An agent that learns.",
5
5
  "private": false,
6
6
  "license": "Apache-2.0",