visionclaw 0.1.187-beta.8 → 0.1.187-dev.refactor-computer-use-direct-coordinates.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/dist/agent/loop.js +1 -1
  2. package/dist/agent/loop.js.map +1 -1
  3. package/dist/agent/providers/client-factory.d.ts +1 -1
  4. package/dist/agent/providers/client-factory.js +1 -1
  5. package/dist/agent/runtime-surface.d.ts +1 -1
  6. package/dist/agent/runtime-surface.d.ts.map +1 -1
  7. package/dist/agent/runtime-surface.js +35 -18
  8. package/dist/agent/runtime-surface.js.map +1 -1
  9. package/dist/agent/system-prompt.d.ts.map +1 -1
  10. package/dist/agent/system-prompt.js +1 -3
  11. package/dist/agent/system-prompt.js.map +1 -1
  12. package/dist/builtin-skills/macos-automation/SKILL.md +13 -10
  13. package/dist/onboarding/generate-wallpaper.d.ts +3 -8
  14. package/dist/onboarding/generate-wallpaper.d.ts.map +1 -1
  15. package/dist/onboarding/generate-wallpaper.js +3 -123
  16. package/dist/onboarding/generate-wallpaper.js.map +1 -1
  17. package/dist/tools/computer-use.d.ts +56 -6
  18. package/dist/tools/computer-use.d.ts.map +1 -1
  19. package/dist/tools/computer-use.js +129 -286
  20. package/dist/tools/computer-use.js.map +1 -1
  21. package/dist-agent/bundle.cjs +208 -574
  22. package/package.json +1 -1
  23. package/dist/agent/applied-credential-signature.d.ts +0 -53
  24. package/dist/agent/applied-credential-signature.d.ts.map +0 -1
  25. package/dist/agent/applied-credential-signature.js +0 -137
  26. package/dist/agent/applied-credential-signature.js.map +0 -1
  27. package/dist/agent/tunnel-credential-handler.d.ts +0 -90
  28. package/dist/agent/tunnel-credential-handler.d.ts.map +0 -1
  29. package/dist/agent/tunnel-credential-handler.js +0 -162
  30. package/dist/agent/tunnel-credential-handler.js.map +0 -1
  31. package/dist/billing/payg-handler.d.ts +0 -29
  32. package/dist/billing/payg-handler.d.ts.map +0 -1
  33. package/dist/billing/payg-handler.js +0 -92
  34. package/dist/billing/payg-handler.js.map +0 -1
  35. package/dist/billing/payment-handler.d.ts +0 -24
  36. package/dist/billing/payment-handler.d.ts.map +0 -1
  37. package/dist/billing/payment-handler.js +0 -101
  38. package/dist/billing/payment-handler.js.map +0 -1
  39. package/dist/builtin-skills/catalog/phone-adb-automation/SKILL.md +0 -412
  40. package/dist/builtin-skills/catalog/phone-adb-automation/phone_input.sh +0 -132
  41. package/dist/builtin-skills/catalog/phone-adb-automation/phone_launch.sh +0 -166
  42. package/dist/builtin-skills/catalog/phone-adb-automation/phone_screenshot.sh +0 -87
  43. package/dist/builtin-skills/catalog/phone-adb-automation/phone_security_kbd.py +0 -174
  44. package/dist/builtin-skills/catalog/phone-adb-automation/phone_setup.sh +0 -274
  45. package/dist/builtin-skills/catalog/phone-adb-automation/phone_swipe.sh +0 -111
  46. package/dist/builtin-skills/catalog/phone-adb-automation/phone_tap.sh +0 -87
  47. package/dist/builtin-skills/catalog/phone-adb-automation/phone_ui_parse.py +0 -176
  48. package/dist/builtin-skills/catalog/phone-adb-automation/phone_wake_unlock.sh +0 -67
  49. package/dist/builtin-skills/transcribe-audio/SKILL.md +0 -122
  50. package/dist/data-processing/convert-demo-cli.d.ts +0 -7
  51. package/dist/data-processing/convert-demo-cli.d.ts.map +0 -1
  52. package/dist/data-processing/convert-demo-cli.js +0 -30
  53. package/dist/data-processing/convert-demo-cli.js.map +0 -1
  54. package/dist/data-processing/convert-demo.d.ts +0 -26
  55. package/dist/data-processing/convert-demo.d.ts.map +0 -1
  56. package/dist/data-processing/convert-demo.js +0 -233
  57. package/dist/data-processing/convert-demo.js.map +0 -1
  58. package/dist/obs/rdp/icons/icons/app_windows.svg +0 -4
  59. package/dist/obs/rdp/icons/icons/clip_get.svg +0 -4
  60. package/dist/obs/rdp/icons/icons/clip_send.svg +0 -4
  61. package/dist/obs/rdp/icons/icons/clip_shared.svg +0 -4
  62. package/dist/obs/rdp/icons/icons/clipboard.svg +0 -4
  63. package/dist/obs/rdp/icons/icons/clipboard_shared.svg +0 -4
  64. package/dist/obs/rdp/icons/icons/control.svg +0 -4
  65. package/dist/obs/rdp/icons/icons/desktop.svg +0 -4
  66. package/dist/obs/rdp/icons/icons/display.svg +0 -4
  67. package/dist/obs/rdp/icons/icons/launchpad.svg +0 -4
  68. package/dist/obs/rdp/icons/icons/mission_control.svg +0 -4
  69. package/dist/obs/rdp/icons/icons/screenshot.svg +0 -4
  70. package/dist/obs/rdp/icons/icons/zoom_actual.svg +0 -4
  71. package/dist/obs/rdp/icons/icons/zoom_fit.svg +0 -4
  72. package/dist/obs/rdp/icons/icons/zoom_in.svg +0 -4
  73. package/dist/obs/rdp/icons/icons/zoom_out.svg +0 -4
  74. package/dist/obs/tunnel-telemetry.d.ts +0 -46
  75. package/dist/obs/tunnel-telemetry.d.ts.map +0 -1
  76. package/dist/obs/tunnel-telemetry.js +0 -70
  77. package/dist/obs/tunnel-telemetry.js.map +0 -1
  78. package/dist/service/gbox-tun.d.ts +0 -14
  79. package/dist/service/gbox-tun.d.ts.map +0 -1
  80. package/dist/service/gbox-tun.js +0 -315
  81. package/dist/service/gbox-tun.js.map +0 -1
  82. package/dist/tools/coordinate-resolver.d.ts +0 -30
  83. package/dist/tools/coordinate-resolver.d.ts.map +0 -1
  84. package/dist/tools/coordinate-resolver.js +0 -104
  85. package/dist/tools/coordinate-resolver.js.map +0 -1
  86. package/dist/utils/wechat-monitor.d.ts +0 -21
  87. package/dist/utils/wechat-monitor.d.ts.map +0 -1
  88. package/dist/utils/wechat-monitor.js +0 -88
  89. package/dist/utils/wechat-monitor.js.map +0 -1
@@ -1,101 +0,0 @@
1
- /**
2
- * Payment command handler for the `payment_required` heartbeat command.
3
- *
4
- * This is deterministic bot code — the LLM agent is never involved in
5
- * payment flows. The handler:
6
- *
7
- * 1. Receives `payment_required` from the backend via heartbeat (includes plans)
8
- * 2. Sends a plan selection prompt (inline keyboard) to the owner via Telegram
9
- * 3. Listens for `billing_plan_selected` events from the Telegram adapter
10
- * 4. Calls the backend to create a Stripe Checkout Session
11
- * 5. Sends the checkout URL back to the owner as a URL button
12
- */
13
- import { logger } from "../logger.js";
14
- const MAX_CHECKOUT_RETRIES = 2;
15
- const RETRY_DELAY_MS = 2_000;
16
- function sleep(ms) {
17
- return new Promise((resolve) => setTimeout(resolve, ms));
18
- }
19
- /**
20
- * Create a CommandHandler for the "payment_required" heartbeat command.
21
- *
22
- * @param channelManager - To send payment prompts and listen for billing events
23
- * @param ownerChatId - The owner's Telegram chat ID
24
- * @param serviceClient - Optional onboarding service client for creating checkout sessions
25
- */
26
- export function createPaymentHandler(channelManager, ownerChatId, serviceClient) {
27
- // Wire up plan selection listener eagerly so it is active regardless of
28
- // whether a `payment_required` command is dispatched (the subscription gate
29
- // sends the plan picker directly without a command).
30
- channelManager.on("billing_plan_selected", (event) => {
31
- void handlePlanSelected(event);
32
- });
33
- async function handlePlanSelected(event) {
34
- if (!serviceClient) {
35
- logger.warn("[billing] No service client u2014 cannot create checkout session");
36
- await channelManager.sendMessage("telegram", String(event.chatId), "Payment is not configured. Please contact support.");
37
- return;
38
- }
39
- logger.system(`[billing] Plan selected: ${event.planId} by chat ${event.chatId}`);
40
- // Retry loop for creating checkout session
41
- for (let attempt = 0; attempt <= MAX_CHECKOUT_RETRIES; attempt++) {
42
- try {
43
- const result = await serviceClient.createCheckoutSession(event.planId, event.chatId);
44
- if (result?.checkoutUrl) {
45
- await channelManager.sendTelegramCheckoutLink(event.chatId, event.planName, result.checkoutUrl);
46
- logger.system(`[billing] Checkout URL sent to chat ${event.chatId} for plan ${event.planId}`);
47
- return;
48
- }
49
- // null result — backend call failed
50
- if (attempt < MAX_CHECKOUT_RETRIES) {
51
- logger.warn(`[billing] Checkout session attempt ${attempt + 1} failed, retrying...`);
52
- await channelManager.sendMessage("telegram", String(event.chatId), "Payment setup failed, retrying...");
53
- await sleep(RETRY_DELAY_MS);
54
- }
55
- }
56
- catch (err) {
57
- const errMsg = err instanceof Error ? err.message : String(err);
58
- logger.err(`[billing] Checkout session attempt ${attempt + 1} error: ${errMsg}`);
59
- if (attempt < MAX_CHECKOUT_RETRIES) {
60
- await sleep(RETRY_DELAY_MS);
61
- }
62
- }
63
- }
64
- // All retries exhausted
65
- logger.err("[billing] All checkout session attempts failed");
66
- await channelManager.sendMessage("telegram", String(event.chatId), "Payment setup failed. Please try again later.");
67
- }
68
- return async (_commandId, payload) => {
69
- // Backend just signals "payment needed" with plans
70
- const raw = (payload ?? {});
71
- const plans = Array.isArray(raw.plans) ? raw.plans : [];
72
- if (plans.length === 0) {
73
- logger.warn("[billing] payment_required received with no plans");
74
- return {
75
- status: "failed",
76
- result: JSON.stringify({ error: "No plans in payload" }),
77
- };
78
- }
79
- const typedPayload = {
80
- message: typeof raw.message === "string" ? raw.message : undefined,
81
- plans,
82
- };
83
- logger.system(`[billing] Payment required received with ${typedPayload.plans.length} plan(s)`);
84
- try {
85
- await channelManager.sendTelegramPaymentPrompt(ownerChatId, typedPayload);
86
- return {
87
- status: "completed",
88
- result: JSON.stringify({ message: "Payment prompt sent to owner" }),
89
- };
90
- }
91
- catch (err) {
92
- const errMsg = err instanceof Error ? err.message : String(err);
93
- logger.err(`[billing] Failed to send payment prompt: ${errMsg}`);
94
- return {
95
- status: "failed",
96
- result: JSON.stringify({ error: errMsg }),
97
- };
98
- }
99
- };
100
- }
101
- //# sourceMappingURL=payment-handler.js.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"payment-handler.js","sourceRoot":"","sources":["../../src/billing/payment-handler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAOH,OAAO,EAAE,MAAM,EAAE,MAAM,cAAc,CAAC;AAEtC,MAAM,oBAAoB,GAAG,CAAC,CAAC;AAC/B,MAAM,cAAc,GAAG,KAAK,CAAC;AAE7B,SAAS,KAAK,CAAC,EAAU;IACvB,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC,CAAC;AAC3D,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,oBAAoB,CAClC,cAA8B,EAC9B,WAAmB,EACnB,aAA6C;IAE7C,wEAAwE;IACxE,4EAA4E;IAC5E,qDAAqD;IACrD,cAAc,CAAC,EAAE,CAAC,uBAAuB,EAAE,CAAC,KAA+B,EAAE,EAAE;QAC7E,KAAK,kBAAkB,CAAC,KAAK,CAAC,CAAC;IACjC,CAAC,CAAC,CAAC;IAEH,KAAK,UAAU,kBAAkB,CAAC,KAA+B;QAC/D,IAAI,CAAC,aAAa,EAAE,CAAC;YACnB,MAAM,CAAC,IAAI,CAAC,kEAAkE,CAAC,CAAC;YAChF,MAAM,cAAc,CAAC,WAAW,CAC9B,UAAU,EACV,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,EACpB,oDAAoD,CACrD,CAAC;YACF,OAAO;QACT,CAAC;QAED,MAAM,CAAC,MAAM,CAAC,4BAA4B,KAAK,CAAC,MAAM,YAAY,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;QAElF,2CAA2C;QAC3C,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,oBAAoB,EAAE,OAAO,EAAE,EAAE,CAAC;YACjE,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,qBAAqB,CACtD,KAAK,CAAC,MAAM,EACZ,KAAK,CAAC,MAAM,CACb,CAAC;gBAEF,IAAI,MAAM,EAAE,WAAW,EAAE,CAAC;oBACxB,MAAM,cAAc,CAAC,wBAAwB,CAC3C,KAAK,CAAC,MAAM,EACZ,KAAK,CAAC,QAAQ,EACd,MAAM,CAAC,WAAW,CACnB,CAAC;oBACF,MAAM,CAAC,MAAM,CACX,uCAAuC,KAAK,CAAC,MAAM,aAAa,KAAK,CAAC,MAAM,EAAE,CAC/E,CAAC;oBACF,OAAO;gBACT,CAAC;gBAED,wCAAwC;gBACxC,IAAI,OAAO,GAAG,oBAAoB,EAAE,CAAC;oBACnC,MAAM,CAAC,IAAI,CAAC,sCAAsC,OAAO,GAAG,CAAC,sBAAsB,CAAC,CAAC;oBACrF,MAAM,cAAc,CAAC,WAAW,CAC9B,UAAU,EACV,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,EACpB,mCAAmC,CACpC,CAAC;oBACF,MAAM,KAAK,CAAC,cAAc,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;YAAC,OAAO,GAAG,EAAE,CAAC;gBACb,MAAM,MAAM,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBAChE,MAAM,CAAC,GAAG,CAAC,sCAAsC,OAAO,GAAG,CAAC,WAAW,MAAM,EAAE,CAAC,CAAC;gBACjF,IAAI,OAAO,GAAG,oBAAoB,EAAE,CAAC;oBACnC,MAAM,KAAK,CAAC,cAAc,CAAC,CAAC;gBAC9B,CAAC;YACH,CAAC;QACH,CAAC;QAED,wBAAwB;QACxB,MAAM,CAAC,GAAG,CAAC,gDAAgD,CAAC,
CAAC;QAC7D,MAAM,cAAc,CAAC,WAAW,CAC9B,UAAU,EACV,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,EACpB,+CAA+C,CAChD,CAAC;IACJ,CAAC;IAED,OAAO,KAAK,EAAE,UAAkB,EAAE,OAAgB,EAAE,EAAE;QACpD,mDAAmD;QACnD,MAAM,GAAG,GAAG,CAAC,OAAO,IAAI,EAAE,CAA4B,CAAC;QACvD,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,KAAwC,CAAC,CAAC,CAAC,EAAE,CAAC;QAE3F,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,mDAAmD,CAAC,CAAC;YACjE,OAAO;gBACL,MAAM,EAAE,QAAiB;gBACzB,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,qBAAqB,EAAE,CAAC;aACzD,CAAC;QACJ,CAAC;QAED,MAAM,YAAY,GAA2B;YAC3C,OAAO,EAAE,OAAO,GAAG,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS;YAClE,KAAK;SACN,CAAC;QAEF,MAAM,CAAC,MAAM,CACX,4CAA4C,YAAY,CAAC,KAAK,CAAC,MAAM,UAAU,CAChF,CAAC;QAEF,IAAI,CAAC;YACH,MAAM,cAAc,CAAC,yBAAyB,CAAC,WAAW,EAAE,YAAY,CAAC,CAAC;YAC1E,OAAO;gBACL,MAAM,EAAE,WAAoB;gBAC5B,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,OAAO,EAAE,8BAA8B,EAAE,CAAC;aACpE,CAAC;QACJ,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,MAAM,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YAChE,MAAM,CAAC,GAAG,CAAC,4CAA4C,MAAM,EAAE,CAAC,CAAC;YACjE,OAAO;gBACL,MAAM,EAAE,QAAiB;gBACzB,MAAM,EAAE,IAAI,CAAC,SAAS,CAAC,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC;aAC1C,CAAC;QACJ,CAAC;IACH,CAAC,CAAC;AACJ,CAAC"}
@@ -1,412 +0,0 @@
1
- ---
2
- description: Use this skill for controlling a connected Android phone via ADB -- taking screenshots, navigating apps, tapping, swiping, typing, and other phone automation tasks. Covers WeChat, WeCom, and other installed Chinese apps. Includes coordinate scaling, Chinese input workarounds, and security keyboard handling.
3
- ---
4
-
5
- # Android Phone ADB Automation
6
-
7
- Complete guide for remotely controlling an Android phone connected via USB ADB. Covers screenshot workflow, touch interactions, app launching, text input limitations, and common troubleshooting.
8
-
9
- ## Helper Scripts (Ready to Use)
10
-
11
- This skill includes ready-to-use scripts that agents can call directly. All scripts are installed alongside this SKILL.md file. Reference them using the skill directory path:
12
-
13
- | Script | Purpose | Key Options |
14
- |--------|---------|-------------|
15
- | `phone_setup.sh` | **Run first!** Install ADBKeyboard, verify device | `--status`, `--uninstall`, `--skip-kbd` |
16
- | `phone_screenshot.sh` | Take, pull & scale screenshot | `-s SCALE%`, `-o PATH`, `-r` (raw) |
17
- | `phone_tap.sh` | Tap with auto coordinate scaling | `-S SCALE%`, `-l` (long press) |
18
- | `phone_swipe.sh` | Swipe with presets (up/down/left/right) | preset or `x1 y1 x2 y2`, `-S SCALE%` |
19
- | `phone_wake_unlock.sh` | Wake screen & swipe to unlock | `-d DEVICE` |
20
- | `phone_launch.sh` | Launch app by alias or package | `-l` (list aliases) |
21
- | `phone_input.sh` | Type text with method options | `-m direct\|adbkbd\|clipboard`, `-e` (enter), `-c` (clear) |
22
- | `phone_ui_parse.py` | Dump & parse UI hierarchy | `-c` (clickable), `-s SEARCH`, `-j` (JSON) |
23
- | `phone_security_kbd.py` | Type on randomized security keyboard | `--dry-run`, `--delay MS` |
24
-
25
- All shell scripts support `-d DEVICE_ID` for multi-device setups and `-q` for quiet mode.
26
-
27
- ### First-Time Setup
28
-
29
- Before using this skill for the first time, run the setup script to install **ADBKeyboard** (required for Chinese text input) and verify the device connection:
30
-
31
- ```bash
32
- SKILL_DIR="<path-to-installed-skill>"
33
- bash "$SKILL_DIR/phone_setup.sh" # Full setup: checks device, installs ADBKeyboard
34
- bash "$SKILL_DIR/phone_setup.sh" --status # Check if everything is configured
35
- ```
36
-
37
- > **Note**: If you skip setup, `phone_input.sh -m adbkbd` will auto-detect the missing ADBKeyboard and run setup automatically on first use.
38
-
39
- ### Quick Example: Screenshot → Analyze → Tap
40
-
41
- ```bash
42
- SKILL_DIR="$(dirname "$0")" # or use the installed skill path
43
-
44
- # Take scaled screenshot
45
- bash "$SKILL_DIR/phone_screenshot.sh" -s 50 -o /tmp/phone_screen.png
46
-
47
- # (Agent analyzes /tmp/phone_screen.png, identifies target at scaled coords 305,664)
48
-
49
- # Tap with auto-scaling from 50% screenshot coords to native
50
- bash "$SKILL_DIR/phone_tap.sh" 305 664 -S 50
51
- ```
52
-
53
- ## Pre-flight Check
54
-
55
- Every time before operating the phone, verify the device connection:
56
-
57
- ```bash
58
- adb devices
59
- # Expected: <device_id> device
60
- # If "unauthorized": user must tap "Allow USB debugging" on phone
61
- # If "offline" or missing: check USB cable connection
62
- ```
63
-
64
- If adb is quarantined by macOS:
65
- ```bash
66
- sudo xattr -rd com.apple.quarantine $(which adb)
67
- ```
68
-
69
- Get device info (resolution, model, Android version):
70
- ```bash
71
- adb shell wm size # e.g. Physical size: 1220x2656
72
- adb shell getprop ro.product.model
73
- adb shell getprop ro.build.version.sdk
74
- ```
75
-
76
- ## Screenshot Workflow (Core Loop)
77
-
78
- Every phone operation follows this cycle:
79
-
80
- ```
81
- Screenshot → Analyze UI → Calculate coordinates → Execute action → Wait → Screenshot to confirm
82
- ```
83
-
84
- ### Taking Screenshots
85
-
86
- **Preferred: Use the helper script** which handles capture, pull, scale, and cleanup:
87
- ```bash
88
- bash phone_screenshot.sh # Default: 50% scale to /tmp/phone_screen.png
89
- bash phone_screenshot.sh -o /tmp/my.png -s 40 # 40% scale, custom output path
90
- bash phone_screenshot.sh -r # Raw native resolution (no scaling)
91
- ```
92
-
93
- Manual approach (if needed):
94
- ```bash
95
- # 1. Capture on device
96
- adb shell screencap -p /sdcard/screen.png
97
-
98
- # 2. Pull to local
99
- adb pull /sdcard/screen.png /tmp/phone_raw.png
100
-
101
- # 3. Scale down (REQUIRED — native resolution often exceeds image processing limits)
102
- # For 1220x2656 device, scale to 50%:
103
- sips --resampleHeightWidth 1328 610 /tmp/phone_raw.png --out /tmp/phone_screen.png
104
-
105
- # 4. Clean up device temp file
106
- adb shell rm /sdcard/screen.png
107
- ```
108
-
109
- > **IMPORTANT**: Always scale screenshots before analysis. Native resolutions (2656px+) exceed the 2000px processing limit. Use 50% scaling as default.
110
-
111
- ### Coordinate Scaling (Critical!)
112
-
113
- When the screenshot is scaled, you must convert coordinates back to native resolution before sending ADB commands:
114
-
115
- ```
116
- Native ADB coordinate = Scaled screenshot coordinate × (Native resolution / Scaled resolution)
117
- ```
118
-
119
- **Example** (50% scale): Screenshot shows target at (305, 664) → ADB tap coordinate = (610, 1328)
120
-
121
- > Always double-check: **read coordinates from scaled image, execute with native coordinates.**
122
-
123
- ## Basic Operations
124
-
125
- ### Wake & Unlock Screen
126
-
127
- ```bash
128
- # Check screen state
129
- adb shell dumpsys power | grep "mWakefulness"
130
- # Awake = screen on, Asleep = screen off
131
-
132
- # Wake up
133
- adb shell input keyevent KEYCODE_WAKEUP
134
-
135
- # Swipe up to unlock (from bottom to middle)
136
- adb shell input swipe 500 2000 500 800 300
137
- ```
138
-
139
- ### Tap
140
-
141
- **Preferred: Use the helper script** with auto coordinate scaling:
142
- ```bash
143
- bash phone_tap.sh 610 1328 # Tap at native coords
144
- bash phone_tap.sh 305 664 -S 50 # Auto-scale from 50% screenshot coords
145
- bash phone_tap.sh 305 664 -S 50 -l # Long press at scaled coords
146
- ```
147
-
148
- Manual approach:
149
- ```bash
150
- # Use NATIVE coordinates (not scaled!)
151
- adb shell input tap X Y
152
-
153
- # Example: tap screen center on 1220x2656 device
154
- adb shell input tap 610 1328
155
- ```
156
-
157
- ### Swipe
158
-
159
- **Preferred: Use the helper script** with directional presets:
160
- ```bash
161
- bash phone_swipe.sh up # Scroll up (auto-detects resolution)
162
- bash phone_swipe.sh down # Scroll down
163
- bash phone_swipe.sh left # Swipe left
164
- bash phone_swipe.sh right # Swipe right
165
- bash phone_swipe.sh 100 2000 100 800 # Custom swipe (native coords)
166
- bash phone_swipe.sh 50 1000 50 400 -S 50 # Custom with auto-scaling
167
- ```
168
-
169
- Manual approach:
170
- ```bash
171
- # adb shell input swipe startX startY endX endY duration_ms
172
- adb shell input swipe X1 Y1 X2 Y2 300
173
-
174
- # Scroll up (view more content)
175
- adb shell input swipe 610 2000 610 800 300
176
-
177
- # Scroll down
178
- adb shell input swipe 610 800 610 2000 300
179
-
180
- # Swipe left (next page)
181
- adb shell input swipe 1000 1328 200 1328 300
182
-
183
- # Swipe right (previous page)
184
- adb shell input swipe 200 1328 1000 1328 300
185
- ```
186
-
187
- ### Key Events
188
-
189
- ```bash
190
- adb shell input keyevent KEYCODE_BACK # Back
191
- adb shell input keyevent KEYCODE_HOME # Home
192
- adb shell input keyevent KEYCODE_WAKEUP # Wake screen
193
- adb shell input keyevent KEYCODE_POWER # Power toggle
194
- adb shell input keyevent KEYCODE_ENTER # Enter/Confirm
195
- adb shell input keyevent KEYCODE_DEL # Backspace/Delete
196
- adb shell input keyevent KEYCODE_VOLUME_UP # Volume up
197
- adb shell input keyevent KEYCODE_VOLUME_DOWN # Volume down
198
- adb shell input keyevent 187 # Recent apps / Multitask
199
- ```
200
-
201
- ### Launch Apps
202
-
203
- **Preferred: Use the helper script** with app name aliases:
204
- ```bash
205
- bash phone_launch.sh wechat # Launch by alias
206
- bash phone_launch.sh com.tencent.mm # Launch by package name
207
- bash phone_launch.sh -l # List all known aliases
208
- ```
209
-
210
- Manual approach:
211
- ```bash
212
- # WeChat
213
- adb shell monkey -p com.tencent.mm -c android.intent.category.LAUNCHER 1
214
-
215
- # WeCom (企业微信)
216
- adb shell monkey -p com.tencent.wework -c android.intent.category.LAUNCHER 1
217
-
218
- # Meituan (美团)
219
- adb shell monkey -p com.sankuai.meituan -c android.intent.category.LAUNCHER 1
220
-
221
- # Ctrip (携程)
222
- adb shell monkey -p ctrip.android.view -c android.intent.category.LAUNCHER 1
223
-
224
- # Xiaohongshu (小红书)
225
- adb shell monkey -p com.xingin.xhs -c android.intent.category.LAUNCHER 1
226
-
227
- # Bilibili (哔哩哔哩)
228
- adb shell monkey -p tv.danmaku.bili -c android.intent.category.LAUNCHER 1
229
-
230
- # Baidu (百度)
231
- adb shell monkey -p com.baidu.searchbox -c android.intent.category.LAUNCHER 1
232
-
233
- # BOSS Zhipin (BOSS直聘)
234
- adb shell monkey -p com.hpbr.bosszhipin -c android.intent.category.LAUNCHER 1
235
-
236
- # Doubao (豆包)
237
- adb shell monkey -p com.larus.nova -c android.intent.category.LAUNCHER 1
238
- ```
239
-
240
- Find any app's package name:
241
- ```bash
242
- adb shell pm list packages | grep -i keyword
243
- ```
244
-
245
- Check which app is in foreground:
246
- ```bash
247
- adb shell dumpsys activity activities | grep "mResumedActivity"
248
- ```
249
-
250
- ## Text Input
251
-
252
- ### English & Numbers (Works directly)
253
-
254
- ```bash
255
- adb shell input text "hello123"
256
- ```
257
-
258
- ### Chinese Input (Known Limitations)
259
-
260
- Direct Chinese input via ADB is **not supported** on Android. All of the following methods have known issues:
261
-
262
- 1. **`adb shell input text '你好'`** → NullPointerException (Android `input text` does not support CJK characters)
263
- 2. **Pinyin via `input text`** → Sogou/other IME nine-grid layouts misparse pinyin segmentation
264
- 3. **Clipboard** → `adb shell cmd clipboard set` returns "No shell command implementation" on many devices
265
- 4. **ADB broadcast** → Requires ADBKeyboard app installed
266
-
267
- ### Chinese Input Workarounds
268
-
269
- - **WeChat/WeCom messages**: Use the agent's messaging API (webhook/notify_user) to send directly, bypassing phone input entirely
270
- - **Search by contact**: Try English names or pinyin initials
271
- - **ADBKeyboard**: If available, install the ADBKeyboard APK to enable `am broadcast` based text input:
272
- ```bash
273
- adb install ADBKeyboard.apk
274
- adb shell ime set com.android.adbkeyboard/.AdbIME
275
- adb shell am broadcast -a ADB_INPUT_TEXT --es msg '你好'
276
- ```
277
-
278
- ## Reading UI State (Alternative to Screenshots)
279
-
280
- **Preferred: Use the helper script** for easy UI parsing:
281
- ```bash
282
- python3 phone_ui_parse.py # Dump & show all elements with bounds
283
- python3 phone_ui_parse.py -c # Only clickable elements
284
- python3 phone_ui_parse.py -s "send" # Filter by text/description
285
- python3 phone_ui_parse.py -c -s "确认" -j # Clickable "confirm" as JSON
286
- python3 phone_ui_parse.py -f /tmp/ui.xml # Parse existing XML file
287
- ```
288
-
289
- Manual approach — dump and parse:
290
- ```bash
291
- # Dump UI accessibility tree
292
- adb shell uiautomator dump /sdcard/ui.xml 2>/dev/null
293
- adb pull /sdcard/ui.xml /tmp/ui.xml
294
- ```
295
-
296
- Parse with Python to find interactive elements:
297
- ```python
298
- import xml.etree.ElementTree as ET, re
299
-
300
- tree = ET.parse('/tmp/ui.xml')
301
- root = tree.getroot()
302
-
303
- for node in root.iter():
304
- text = node.get('text', '')
305
- desc = node.get('content-desc', '')
306
- bounds = node.get('bounds', '')
307
- clickable = node.get('clickable', '')
308
- if bounds and (text or desc):
309
- m = re.findall(r'\d+', bounds)
310
- if len(m) == 4:
311
- cx = (int(m[0]) + int(m[2])) // 2
312
- cy = (int(m[1]) + int(m[3])) // 2
313
- print(f'"{text}" desc="{desc}" center=({cx},{cy}) clickable={clickable}')
314
- ```
315
-
316
- ### Security Keyboard Handling
317
-
318
- Financial apps use randomized security keyboards where digit positions change each time.
319
-
320
- **Preferred: Use the helper script** which auto-detects and types on security keyboards:
321
- ```bash
322
- python3 phone_security_kbd.py "123456" # Type PIN
323
- python3 phone_security_kbd.py "888888" --dry-run # Show digit positions without tapping
324
- python3 phone_security_kbd.py "1234" --delay 300 # Slower tapping (300ms between digits)
325
- ```
326
-
327
- Manual approach — read `content-desc` from UI dump to map digits:
328
- ```python
329
- def get_security_keyboard_map(root):
330
- """Returns dict of digit/char -> (center_x, center_y)"""
331
- kbd = {}
332
- for node in root.iter():
333
- desc = node.get('content-desc', '')
334
- bounds = node.get('bounds', '')
335
- if desc and bounds and node.get('clickable') == 'true':
336
- m = re.findall(r'\d+', bounds)
337
- if len(m) == 4:
338
- cx = (int(m[0]) + int(m[2])) // 2
339
- cy = (int(m[1]) + int(m[3])) // 2
340
- kbd[desc] = (cx, cy)
341
- return kbd
342
- ```
343
-
344
- ## Standard Operating Procedures
345
-
346
- ### Open WeChat and Browse
347
-
348
- **Using helper scripts (recommended):**
349
- ```bash
350
- bash phone_wake_unlock.sh # Wake & unlock
351
- sleep 1
352
- bash phone_launch.sh wechat # Launch WeChat
353
- sleep 2
354
- bash phone_screenshot.sh -s 50 # Take scaled screenshot
355
- # Now read /tmp/phone_screen.png to analyze the interface
356
- ```
357
-
358
- **Manual approach:**
359
- ```bash
360
- # Wake phone
361
- adb shell input keyevent KEYCODE_WAKEUP
362
- sleep 0.5
363
- adb shell input swipe 500 2000 500 800 300 # Unlock
364
- sleep 1
365
-
366
- # Launch WeChat
367
- adb shell monkey -p com.tencent.mm -c android.intent.category.LAUNCHER 1
368
- sleep 2
369
-
370
- # Take screenshot to analyze current view
371
- adb shell screencap -p /sdcard/screen.png
372
- adb pull /sdcard/screen.png /tmp/phone_raw.png
373
- sips --resampleHeightWidth 1328 610 /tmp/phone_raw.png --out /tmp/phone_screen.png
374
- adb shell rm /sdcard/screen.png
375
- # Now read /tmp/phone_screen.png to analyze the interface
376
- ```
377
-
378
- ### Install an APK
379
-
380
- ```bash
381
- # Install from local file
382
- adb install -r /path/to/app.apk
383
-
384
- # Or push then install (more reliable for large APKs)
385
- adb push /path/to/app.apk /data/local/tmp/app.apk
386
- adb shell pm install -r /data/local/tmp/app.apk
387
- ```
388
-
389
- ## Troubleshooting
390
-
391
- | Problem | Cause | Solution |
392
- |---------|-------|----------|
393
- | Screenshot is all black | Phone screen is off / asleep | `adb shell input keyevent KEYCODE_WAKEUP` + swipe to unlock |
394
- | Screenshot too large to analyze | Native resolution too high | Scale to 50% with `sips` before reading |
395
- | Device shows "unauthorized" | USB debugging not approved | Notify user to tap "Allow" on phone screen |
396
- | adb quarantined on macOS | Gatekeeper blocked binary | `sudo xattr -rd com.apple.quarantine $(which adb)` |
397
- | Chinese text can't be typed | Android `input text` CJK limitation | Use webhook/API, or install ADBKeyboard |
398
- | Tap lands on wrong position | Coordinates not scaled back | Multiply scaled coordinates by (native resolution / scaled resolution) to get native coords — e.g. ×2 for a 50% screenshot |
399
- | Pinyin input garbled | Sogou nine-grid layout | Avoid pinyin via `input text`; use ADBKeyboard or API |
400
- | INJECT_EVENTS SecurityException | Xiaomi USB debug security setting | Enable "USB调试(安全设置)" in Developer Options |
401
- | FLAG_SECURE blocks screenshot | App security policy | Use `uiautomator dump` to read UI tree instead |
402
- | Screen locks during operation | Auto-lock timeout | `adb shell settings put system screen_off_timeout 1800000` |
403
-
404
- ## Best Practices
405
-
406
- 1. **Always screenshot before acting** — never tap blindly; confirm the current UI state first
407
- 2. **Wait after interactions** — `sleep 1-2` seconds after tap/swipe before taking the next screenshot
408
- 3. **Always scale coordinates** — read from scaled image, execute with native coordinates
409
- 4. **Clean up temp files** — delete screenshots from both phone (`/sdcard/`) and local (`/tmp/`) after use
410
- 5. **Avoid rapid screenshots** — each involves device storage I/O; don't use in tight loops
411
- 6. **Re-wake if idle** — phone may auto-lock after inactivity; always check screen state first
412
- 7. **Use UI dump for secure apps** — FLAG_SECURE apps return black screenshots; `uiautomator dump` still works
@@ -1,132 +0,0 @@
1
- #!/usr/bin/env bash
2
- # phone_input.sh - Input text on Android phone via ADB
3
- #
4
- # Usage:
5
- # bash phone_input.sh <text> [options]
6
- #
7
- # Options:
8
- # -d DEVICE_ID Target specific device
9
- # -m METHOD Input method: "direct" (default), "adbkbd", "clipboard"
10
- # -e Send Enter key after input
11
- # -c Clear existing text first (select all + delete)
12
- # -q Quiet mode
13
- #
14
- # Methods:
15
- # direct - adb shell input text (English/numbers only)
16
- # adbkbd - ADBKeyboard broadcast (supports Chinese, requires ADBKeyboard app)
17
- # clipboard - Set clipboard + paste (requires Android 10+, may not work on all devices)
18
- #
19
- # Notes:
20
- # - Direct method does NOT support Chinese/CJK characters
21
- # - Special characters (spaces, quotes) are escaped automatically
22
- # - For Chinese input, install ADBKeyboard and use -m adbkbd
23
- #
24
- # Examples:
25
- # bash phone_input.sh "hello world" # English text
26
- # bash phone_input.sh "hello" -e # Type and press Enter
27
- # bash phone_input.sh "你好" -m adbkbd # Chinese via ADBKeyboard
28
- # bash phone_input.sh "new text" -c # Clear field first, then type
29
-
30
- set -euo pipefail
31
-
32
- DEVICE=""
33
- METHOD="direct"
34
- ENTER=false
35
- CLEAR=false
36
- QUIET=false
37
-
38
- # Parse args
39
- POSITIONAL=()
40
- ARGV=("$@")
41
- i=0
42
- while [ $i -lt ${#ARGV[@]} ]; do
43
- case "${ARGV[$i]}" in
44
- -d) DEVICE="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
45
- -m) METHOD="${ARGV[$((i+1))]}"; i=$((i+2)) ;;
46
- -e) ENTER=true; i=$((i+1)) ;;
47
- -c) CLEAR=true; i=$((i+1)) ;;
48
- -q) QUIET=true; i=$((i+1)) ;;
49
- -*) echo "Unknown option: ${ARGV[$i]}" >&2; exit 1 ;;
50
- *) POSITIONAL+=("${ARGV[$i]}"); i=$((i+1)) ;;
51
- esac
52
- done
53
-
54
- if [ ${#POSITIONAL[@]} -eq 0 ]; then
55
- echo "Usage: $0 <text> [-d device] [-m method] [-e] [-c] [-q]" >&2
56
- exit 1
57
- fi
58
-
59
- TEXT="${POSITIONAL[0]}"
60
-
61
- ADB_CMD="adb"
62
- if [ -n "$DEVICE" ]; then
63
- ADB_CMD="adb -s $DEVICE"
64
- fi
65
-
66
- info() { $QUIET || echo "[phone_input] $*" >&2; }
67
- warn() { echo "[phone_input] WARNING: $*" >&2; }
68
- error() { echo "[phone_input] ERROR: $*" >&2; }
69
-
70
- # Clear existing text if requested
71
- if $CLEAR; then
72
- info "Clearing existing text..."
73
- # Select all (jump to start of field, then attempt Shift + move-to-end) then delete
74
- $ADB_CMD shell input keyevent KEYCODE_MOVE_HOME
75
- $ADB_CMD shell input keyevent --longpress KEYCODE_SHIFT_LEFT KEYCODE_MOVE_END 2>/dev/null || true
76
- $ADB_CMD shell input keyevent KEYCODE_DEL
77
- sleep 0.3
78
- fi
79
-
80
- case "$METHOD" in
81
- direct)
82
- # Escape special characters for adb shell input text
83
- # Replace spaces with %s, & with \&, etc.
84
- ESCAPED=$(echo "$TEXT" | sed 's/ /%s/g; s/&/\\&/g; s/</\\</g; s/>/\\>/g; s/(/\\(/g; s/)/\\)/g; s/|/\\|/g; s/;/\\;/g; s/"/\\"/g; s/'"'"'/\\'"'"'/g')
85
- info "Typing (direct): $TEXT"
86
- $ADB_CMD shell input text "$ESCAPED"
87
- ;;
88
- adbkbd)
89
- info "Typing (ADBKeyboard): $TEXT"
90
- # Check if ADBKeyboard is available; auto-install if missing
91
- if ! $ADB_CMD shell ime list -s 2>/dev/null | grep -q "adbkeyboard"; then
92
- warn "ADBKeyboard not found. Attempting auto-install..."
93
- SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
94
- if [ -f "$SCRIPT_DIR/phone_setup.sh" ]; then
95
- bash "$SCRIPT_DIR/phone_setup.sh" ${DEVICE:+-d "$DEVICE"} -q || {
96
- error "Auto-install failed. Run manually: bash phone_setup.sh"
97
- exit 1
98
- }
99
- info "ADBKeyboard auto-installed successfully"
100
- else
101
- echo "ERROR: ADBKeyboard not installed. Run phone_setup.sh first:" >&2
102
- echo " bash phone_setup.sh" >&2
103
- exit 1
104
- fi
105
- fi
106
- $ADB_CMD shell am broadcast -a ADB_INPUT_TEXT --es msg "$TEXT" 2>/dev/null
107
- ;;
108
- clipboard)
109
- info "Typing (clipboard): $TEXT"
110
- # Try to set clipboard and paste
111
- $ADB_CMD shell "echo '$TEXT' | cmd clipboard set_text" 2>/dev/null || {
112
- echo "ERROR: Clipboard method not supported on this device" >&2
113
- exit 1
114
- }
115
- # Paste: Ctrl+V
116
- $ADB_CMD shell input keyevent 279 # KEYCODE_PASTE
117
- ;;
118
- *)
119
- echo "ERROR: Unknown method '$METHOD'. Use: direct, adbkbd, clipboard" >&2
120
- exit 1
121
- ;;
122
- esac
123
-
124
- # Press Enter if requested
125
- if $ENTER; then
126
- sleep 0.2
127
- $ADB_CMD shell input keyevent KEYCODE_ENTER
128
- info "Pressed Enter"
129
- fi
130
-
131
- info "Done."
132
- echo "input_ok"