0agent 1.0.69 → 1.0.70

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/daemon.mjs +62 -16
  2. package/package.json +1 -1
package/dist/daemon.mjs CHANGED
@@ -3027,7 +3027,7 @@ var init_GUICapability = __esm({
3027
3027
  properties: {
3028
3028
  action: {
3029
3029
  type: "string",
3030
- description: 'Browser (no Screen Recording): "click_text"|"type_in"|"get_elements"|"read_element"|"get_media_state"|"scroll_to"|"exec_js"|"browser_state"|"cdp_screenshot" | Native apps: "accessibility_click" | Navigation: "open_url"|"open_app" | Mouse/KB (Screen Recording for screenshots): "screenshot"|"click"|"double_click"|"right_click"|"move"|"type"|"hotkey"|"scroll"|"drag"|"find_and_click"|"get_screen_size"|"get_cursor_pos"|"wait"'
3030
+ description: 'Browser (no Screen Recording): "click_text"|"type_in"|"get_elements"|"read_element"|"get_media_state"|"scroll_to"|"exec_js"|"browser_state"|"cdp_screenshot" | Native apps (no Screen Recording): "app_type"|"accessibility_click" | Navigation: "open_url"|"open_app" | Mouse/KB (Screen Recording for screenshots): "screenshot"|"click"|"double_click"|"right_click"|"move"|"type"|"hotkey"|"scroll"|"drag"|"find_and_click"|"get_screen_size"|"get_cursor_pos"|"wait"'
3031
3031
  },
3032
3032
  js: { type: "string", description: `JavaScript to execute in Chrome tab (use with exec_js). Example: "document.querySelector('video').paused"` },
3033
3033
  selector: { type: "string", description: 'CSS selector for read_element, type_in, scroll_to (e.g. "input[type=search]", ".title", "video")' },
@@ -3292,7 +3292,7 @@ print(f"Moved to ({${x}}, {${y}})")
3292
3292
  case "type": {
3293
3293
  if (!text) return null;
3294
3294
  return header + `
3295
- pyautogui.write(${JSON.stringify(text)}, interval=${interval})
3295
+ pyautogui.typewrite(${JSON.stringify(text)}, interval=${interval})
3296
3296
  print("Typed successfully")
3297
3297
  `;
3298
3298
  }
@@ -3587,6 +3587,48 @@ time.sleep(1.5)
3587
3587
  `;
3588
3588
  }
3589
3589
  // ── New high-level browser actions — no Screen Recording needed ───────────
3590
+ case "app_type": {
3591
+ const appName = String(input.app ?? "").trim();
3592
+ const typeText = String(input.text ?? text ?? "").trim();
3593
+ if (!appName || !typeText) return null;
3594
+ const osName = platform2();
3595
+ if (osName !== "darwin") return header + `print("app_type requires macOS")`;
3596
+ const safeApp = appName.replace(/'/g, "\\'");
3597
+ const textJson = JSON.stringify(typeText);
3598
+ return header + `
3599
+ import subprocess, time, json
3600
+
3601
+ text_to_type = json.loads(${textJson})
3602
+
3603
+ # Step 1: copy to clipboard (handles unicode, special chars, long text)
3604
+ cp = subprocess.run(['pbcopy'], input=text_to_type.encode('utf-8'), capture_output=True)
3605
+ if cp.returncode != 0:
3606
+ print(f"Clipboard copy failed: {cp.stderr.decode()[:100]}")
3607
+ sys.exit(1)
3608
+
3609
+ # Step 2: bring app to front
3610
+ subprocess.run(['osascript', '-e', 'tell application "${safeApp}" to activate'], capture_output=True)
3611
+ time.sleep(0.4)
3612
+
3613
+ # Step 3: paste via AppleScript System Events (targets the specific process, not OS focus)
3614
+ paste_script = """tell application "System Events"
3615
+ tell process "${safeApp}"
3616
+ keystroke "v" using command down
3617
+ end tell
3618
+ end tell"""
3619
+ r = subprocess.run(['osascript', '-e', paste_script], capture_output=True, text=True)
3620
+ if r.returncode == 0:
3621
+ print(f"Typed in ${safeApp}: {text_to_type[:60]}")
3622
+ else:
3623
+ # Accessibility permission might be needed
3624
+ err = r.stderr.strip()
3625
+ if 'not allowed' in err.lower() or 'accessibility' in err.lower():
3626
+ subprocess.run(['open', 'x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility'], capture_output=True)
3627
+ print(f"Accessibility permission needed for ${safeApp}. System Settings opened \u2014 Privacy & Security \u2192 Accessibility \u2192 enable Terminal.")
3628
+ else:
3629
+ print(f"app_type error: {err[:200]}")
3630
+ `;
3631
+ }
3590
3632
  case "click_text": {
3591
3633
  if (!text) return null;
3592
3634
  if (platform2() !== "darwin") return header + `print("click_text requires macOS + Chrome")`;
@@ -4709,20 +4751,24 @@ content = element.text if element else page.get_all_text()` : `content = page.ge
4709
4751
  lines.push(
4710
4752
  ``,
4711
4753
  `Browser/GUI actions \u2014 ALL work without Screen Recording permission:`,
4712
- `\u2022 click_text {text:"Submit"} \u2014 click any button/link/tab by its visible text. Use get_elements first if unsure.`,
4713
- `\u2022 type_in {selector:"search", text:"Drake"} \u2014 fill form field by placeholder or aria-label. Handles React events.`,
4714
- `\u2022 get_elements \u2014 list every button/link/input/heading on the page. ALWAYS call this first when navigating.`,
4715
- `\u2022 read_element {selector:"h1"} \u2014 read text of any element. selector="" reads full page.`,
4716
- `\u2022 get_media_state \u2014 returns {state:PLAYING/PAUSED, time, duration, volume}. Call after play/pause to verify.`,
4717
- `\u2022 scroll_to {selector:".result"} or {direction:"down", amount:400} \u2014 scroll page.`,
4718
- `\u2022 exec_js {js:"..."} \u2014 run arbitrary JS. Use for anything not covered above.`,
4719
- `\u2022 browser_state \u2014 get current URL + title. Call after any navigation.`,
4720
- `\u2022 cdp_screenshot \u2014 screenshot via Chrome DevTools Protocol (if Chrome has --remote-debugging-port=9222).`,
4721
- `\u2022 accessibility_click {app:"WhatsApp", element:"Send"} \u2014 click native macOS app button (no Screen Recording).`,
4722
- `\u2022 open_url {url:"..."} \u2014 navigate. Returns actual URL + title + video state. Read it.`,
4723
- `\u2022 hotkey {keys:"k", app:"Google Chrome"} \u2014 send key to specific app. Without app param: goes to Terminal.`,
4724
- `WORKFLOW: navigate \u2192 get_elements \u2192 click_text/type_in \u2192 get_media_state/browser_state to verify.`,
4725
- `NEVER assume success. Always verify with get_media_state, browser_state, or read_element.`
4754
+ `BROWSER (Chrome): click_text {text} | type_in {selector,text} | get_elements | read_element {selector}`,
4755
+ ` get_media_state | scroll_to {selector|direction} | exec_js {js} | browser_state | cdp_screenshot`,
4756
+ `NATIVE APPS (no Screen Recording \u2014 use these for WhatsApp, iMessage, Finder):`,
4757
+ ` app_type {app:"WhatsApp", text:"hi"} \u2014 types via clipboard paste \u2192 cmd+v into the app.`,
4758
+ ` Uses macOS clipboard so unicode/emoji/special chars all work. Target app gets the text`,
4759
+ ` regardless of OS keyboard focus. ALWAYS use this for native app text input.`,
4760
+ ` accessibility_click {app:"WhatsApp", element:"Send"} \u2014 click button via Accessibility API.`,
4761
+ ` hotkey {keys:"cmd+f", app:"WhatsApp"} \u2014 send hotkey to specific app (not Terminal).`,
4762
+ `WHATSAPP WORKFLOW (use this exact sequence):`,
4763
+ ` 1. open_app {app:"WhatsApp"}`,
4764
+ ` 2. hotkey {keys:"cmd+f", app:"WhatsApp"} \u2014 open search`,
4765
+ ` 3. app_type {app:"WhatsApp", text:"ContactName"} \u2014 search for contact`,
4766
+ ` 4. hotkey {keys:"enter", app:"WhatsApp"} \u2014 open the conversation`,
4767
+ ` 5. app_type {app:"WhatsApp", text:"your message"} \u2014 type message`,
4768
+ ` 6. hotkey {keys:"enter", app:"WhatsApp"} \u2014 send`,
4769
+ ` 7. accessibility_click {app:"WhatsApp", element:"Send"} \u2014 if enter doesn't send`,
4770
+ `NEVER use bare 'type' action for native apps \u2014 it goes to Terminal not the app.`,
4771
+ `ALWAYS verify: browser_state after web nav, get_media_state after play/pause, read_element for page content.`
4726
4772
  );
4727
4773
  }
4728
4774
  if (isSelfMod && this.agentRoot) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "0agent",
3
- "version": "1.0.69",
3
+ "version": "1.0.70",
4
4
  "description": "A persistent, learning AI agent that runs on your machine. An agent that learns.",
5
5
  "private": false,
6
6
  "license": "Apache-2.0",