0agent 1.0.69 → 1.0.71

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/daemon.mjs +111 -24
  2. package/package.json +1 -1
package/dist/daemon.mjs CHANGED
@@ -3027,7 +3027,7 @@ var init_GUICapability = __esm({
3027
3027
  properties: {
3028
3028
  action: {
3029
3029
  type: "string",
3030
- description: 'Browser (no Screen Recording): "click_text"|"type_in"|"get_elements"|"read_element"|"get_media_state"|"scroll_to"|"exec_js"|"browser_state"|"cdp_screenshot" | Native apps: "accessibility_click" | Navigation: "open_url"|"open_app" | Mouse/KB (Screen Recording for screenshots): "screenshot"|"click"|"double_click"|"right_click"|"move"|"type"|"hotkey"|"scroll"|"drag"|"find_and_click"|"get_screen_size"|"get_cursor_pos"|"wait"'
3030
+ description: 'Browser (no Screen Recording): "click_text"|"type_in"|"get_elements"|"read_element"|"get_media_state"|"scroll_to"|"exec_js"|"browser_state"|"cdp_screenshot" | Native apps (no Screen Recording): "app_type"|"accessibility_click" | Navigation: "open_url"|"open_app" | Mouse/KB (Screen Recording for screenshots): "screenshot"|"click"|"double_click"|"right_click"|"move"|"type"|"hotkey"|"scroll"|"drag"|"find_and_click"|"get_screen_size"|"get_cursor_pos"|"wait"'
3031
3031
  },
3032
3032
  js: { type: "string", description: `JavaScript to execute in Chrome tab (use with exec_js). Example: "document.querySelector('video').paused"` },
3033
3033
  selector: { type: "string", description: 'CSS selector for read_element, type_in, scroll_to (e.g. "input[type=search]", ".title", "video")' },
@@ -3292,7 +3292,7 @@ print(f"Moved to ({${x}}, {${y}})")
3292
3292
  case "type": {
3293
3293
  if (!text) return null;
3294
3294
  return header + `
3295
- pyautogui.write(${JSON.stringify(text)}, interval=${interval})
3295
+ pyautogui.typewrite(${JSON.stringify(text)}, interval=${interval})
3296
3296
  print("Typed successfully")
3297
3297
  `;
3298
3298
  }
@@ -3303,24 +3303,58 @@ print("Typed successfully")
3303
3303
  const pyKeys = JSON.stringify(parts);
3304
3304
  if (targetApp && platform2() === "darwin") {
3305
3305
  const safeApp = targetApp.replace(/'/g, "\\'");
3306
- const asKey = parts[parts.length - 1] ?? "";
3307
- const modifiers = parts.slice(0, -1).map((k) => {
3306
+ const mainKey = parts[parts.length - 1] ?? "";
3307
+ const modParts = parts.slice(0, -1);
3308
+ const KEY_CODES = {
3309
+ down: 125,
3310
+ up: 126,
3311
+ left: 123,
3312
+ right: 124,
3313
+ enter: 36,
3314
+ return: 36,
3315
+ escape: 53,
3316
+ esc: 53,
3317
+ tab: 48,
3318
+ delete: 51,
3319
+ backspace: 51,
3320
+ "delete-forward": 117,
3321
+ space: 49,
3322
+ home: 115,
3323
+ end: 119,
3324
+ pageup: 116,
3325
+ pagedown: 121,
3326
+ f1: 122,
3327
+ f2: 120,
3328
+ f3: 99,
3329
+ f4: 118,
3330
+ f5: 96,
3331
+ f6: 97,
3332
+ f7: 98,
3333
+ f8: 100,
3334
+ f9: 101,
3335
+ f10: 109,
3336
+ f11: 103,
3337
+ f12: 111
3338
+ };
3339
+ const keyCode = KEY_CODES[mainKey];
3340
+ const modifiers = modParts.map((k) => {
3308
3341
  if (k === "command") return "command down";
3309
3342
  if (k === "ctrl") return "control down";
3310
3343
  if (k === "shift") return "shift down";
3311
3344
  if (k === "option") return "option down";
3312
3345
  return "";
3313
3346
  }).filter(Boolean).join(", ");
3314
- const asModStr = modifiers ? ` using {${modifiers}}` : "";
3347
+ const usingClause = modifiers ? ` using {${modifiers}}` : "";
3348
+ const keyStatement = keyCode !== void 0 ? `key code ${keyCode}${usingClause}` : `keystroke "${mainKey}"${usingClause}`;
3315
3349
  return header + `
3316
3350
  import subprocess, time
3317
- # Focus target app first
3318
3351
  subprocess.run(['osascript', '-e', 'tell application "${safeApp}" to activate'], capture_output=True)
3319
3352
  time.sleep(0.3)
3320
- # Send keystroke via AppleScript (reliable \u2014 goes to the focused app, not Terminal)
3321
- as_script = '''tell application "System Events"
3322
- keystroke "${asKey}"${asModStr}
3323
- end tell'''
3353
+ as_script = """tell application "System Events"
3354
+ tell process "${safeApp}"
3355
+ ${keyStatement}
3356
+ end tell
3357
+ end tell"""
3324
3358
  r = subprocess.run(['osascript', '-e', as_script], capture_output=True, text=True)
3325
3359
  if r.returncode == 0:
3326
3360
  print(f"Sent ${parts.join("+")} to ${safeApp}")
@@ -3587,6 +3621,48 @@ time.sleep(1.5)
3587
3621
  `;
3588
3622
  }
3589
3623
  // ── New high-level browser actions — no Screen Recording needed ───────────
3624
+ case "app_type": {
3625
+ const appName = String(input.app ?? "").trim();
3626
+ const typeText = String(input.text ?? text ?? "").trim();
3627
+ if (!appName || !typeText) return null;
3628
+ const osName = platform2();
3629
+ if (osName !== "darwin") return header + `print("app_type requires macOS")`;
3630
+ const safeApp = appName.replace(/'/g, "\\'");
3631
+ const textJson = JSON.stringify(typeText);
3632
+ return header + `
3633
+ import subprocess, time, json
3634
+
3635
+ text_to_type = json.loads(${textJson})
3636
+
3637
+ # Step 1: copy to clipboard (handles unicode, special chars, long text)
3638
+ cp = subprocess.run(['pbcopy'], input=text_to_type.encode('utf-8'), capture_output=True)
3639
+ if cp.returncode != 0:
3640
+ print(f"Clipboard copy failed: {cp.stderr.decode()[:100]}")
3641
+ sys.exit(1)
3642
+
3643
+ # Step 2: bring app to front
3644
+ subprocess.run(['osascript', '-e', 'tell application "${safeApp}" to activate'], capture_output=True)
3645
+ time.sleep(0.4)
3646
+
3647
+ # Step 3: paste via AppleScript System Events (targets the specific process, not OS focus)
3648
+ paste_script = """tell application "System Events"
3649
+ tell process "${safeApp}"
3650
+ keystroke "v" using command down
3651
+ end tell
3652
+ end tell"""
3653
+ r = subprocess.run(['osascript', '-e', paste_script], capture_output=True, text=True)
3654
+ if r.returncode == 0:
3655
+ print(f"Typed in ${safeApp}: {text_to_type[:60]}")
3656
+ else:
3657
+ # Accessibility permission might be needed
3658
+ err = r.stderr.strip()
3659
+ if 'not allowed' in err.lower() or 'accessibility' in err.lower():
3660
+ subprocess.run(['open', 'x-apple.systempreferences:com.apple.preference.security?Privacy_Accessibility'], capture_output=True)
3661
+ print(f"Accessibility permission needed for ${safeApp}. System Settings opened \u2014 Privacy & Security \u2192 Accessibility \u2192 enable Terminal.")
3662
+ else:
3663
+ print(f"app_type error: {err[:200]}")
3664
+ `;
3665
+ }
3590
3666
  case "click_text": {
3591
3667
  if (!text) return null;
3592
3668
  if (platform2() !== "darwin") return header + `print("click_text requires macOS + Chrome")`;
@@ -4709,20 +4785,31 @@ content = element.text if element else page.get_all_text()` : `content = page.ge
4709
4785
  lines.push(
4710
4786
  ``,
4711
4787
  `Browser/GUI actions \u2014 ALL work without Screen Recording permission:`,
4712
- `\u2022 click_text {text:"Submit"} \u2014 click any button/link/tab by its visible text. Use get_elements first if unsure.`,
4713
- `\u2022 type_in {selector:"search", text:"Drake"} \u2014 fill form field by placeholder or aria-label. Handles React events.`,
4714
- `\u2022 get_elements \u2014 list every button/link/input/heading on the page. ALWAYS call this first when navigating.`,
4715
- `\u2022 read_element {selector:"h1"} \u2014 read text of any element. selector="" reads full page.`,
4716
- `\u2022 get_media_state \u2014 returns {state:PLAYING/PAUSED, time, duration, volume}. Call after play/pause to verify.`,
4717
- `\u2022 scroll_to {selector:".result"} or {direction:"down", amount:400} \u2014 scroll page.`,
4718
- `\u2022 exec_js {js:"..."} \u2014 run arbitrary JS. Use for anything not covered above.`,
4719
- `\u2022 browser_state \u2014 get current URL + title. Call after any navigation.`,
4720
- `\u2022 cdp_screenshot \u2014 screenshot via Chrome DevTools Protocol (if Chrome has --remote-debugging-port=9222).`,
4721
- `\u2022 accessibility_click {app:"WhatsApp", element:"Send"} \u2014 click native macOS app button (no Screen Recording).`,
4722
- `\u2022 open_url {url:"..."} \u2014 navigate. Returns actual URL + title + video state. Read it.`,
4723
- `\u2022 hotkey {keys:"k", app:"Google Chrome"} \u2014 send key to specific app. Without app param: goes to Terminal.`,
4724
- `WORKFLOW: navigate \u2192 get_elements \u2192 click_text/type_in \u2192 get_media_state/browser_state to verify.`,
4725
- `NEVER assume success. Always verify with get_media_state, browser_state, or read_element.`
4788
+ `BROWSER (Chrome): click_text {text} | type_in {selector,text} | get_elements | read_element {selector}`,
4789
+ ` get_media_state | scroll_to {selector|direction} | exec_js {js} | browser_state | cdp_screenshot`,
4790
+ `NATIVE APPS (no Screen Recording \u2014 use these for WhatsApp, iMessage, Finder):`,
4791
+ ` app_type {app:"WhatsApp", text:"hi"} \u2014 types via clipboard paste \u2192 cmd+v into the app.`,
4792
+ ` Uses macOS clipboard so unicode/emoji/special chars all work. Target app gets the text`,
4793
+ ` regardless of OS keyboard focus. ALWAYS use this for native app text input.`,
4794
+ ` accessibility_click {app:"WhatsApp", element:"Send"} \u2014 click button via Accessibility API.`,
4795
+ ` hotkey {keys:"cmd+f", app:"WhatsApp"} \u2014 send hotkey to specific app (not Terminal).`,
4796
+ `NATIVE APP SEARCH NAVIGATION (generalised \u2014 works for WhatsApp, Spotlight, file dialogs, any search):`,
4797
+ ` After typing in a search box, results appear in a dropdown/list.`,
4798
+ ` Navigate them with: hotkey {keys:"down", app:"AppName"} to move to first result,`,
4799
+ ` then hotkey {keys:"enter", app:"AppName"} to select/open it.`,
4800
+ ` If first down+enter doesn't work, try: down, down, enter (some apps skip a header row).`,
4801
+ ` Use wait {seconds:1} after typing to give results time to load before navigating.`,
4802
+ `WHATSAPP EXACT WORKFLOW:`,
4803
+ ` 1. open_app {app:"WhatsApp"}`,
4804
+ ` 2. hotkey {keys:"cmd+f", app:"WhatsApp"} \u2014 open search bar`,
4805
+ ` 3. app_type {app:"WhatsApp", text:"ContactName"} \u2014 type contact name`,
4806
+ ` 4. wait {seconds:1} \u2014 wait for search results to load`,
4807
+ ` 5. hotkey {keys:"down", app:"WhatsApp"} \u2014 move focus to first result`,
4808
+ ` 6. hotkey {keys:"enter", app:"WhatsApp"} \u2014 open the conversation`,
4809
+ ` 7. app_type {app:"WhatsApp", text:"your message"} \u2014 type message`,
4810
+ ` 8. hotkey {keys:"enter", app:"WhatsApp"} \u2014 send`,
4811
+ `NEVER use bare 'type' action for native apps \u2014 it goes to Terminal not the app.`,
4812
+ `ALWAYS verify: browser_state after web nav, get_media_state after play/pause.`
4726
4813
  );
4727
4814
  }
4728
4815
  if (isSelfMod && this.agentRoot) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "0agent",
3
- "version": "1.0.69",
3
+ "version": "1.0.71",
4
4
  "description": "A persistent, learning AI agent that runs on your machine. An agent that learns.",
5
5
  "private": false,
6
6
  "license": "Apache-2.0",