physiclaw 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. physiclaw-0.0.1/.claude/settings.local.json +23 -0
  2. physiclaw-0.0.1/.claude/skills/calibrate-keyboard/SKILL.md +137 -0
  3. physiclaw-0.0.1/.claude/skills/calibrate-keyboard/template.md +8 -0
  4. physiclaw-0.0.1/.claude/skills/cron/SKILL.md +115 -0
  5. physiclaw-0.0.1/.claude/skills/jd/SKILL.md +38 -0
  6. physiclaw-0.0.1/.claude/skills/open-app/SKILL.md +64 -0
  7. physiclaw-0.0.1/.claude/skills/phone-setup/SKILL.md +117 -0
  8. physiclaw-0.0.1/.claude/skills/setup/SKILL.md +15 -0
  9. physiclaw-0.0.1/.claude/skills/setup-vision-models/SKILL.md +38 -0
  10. physiclaw-0.0.1/.claude/skills/wechat/SKILL.md +100 -0
  11. physiclaw-0.0.1/.gitignore +38 -0
  12. physiclaw-0.0.1/.python-version +1 -0
  13. physiclaw-0.0.1/PKG-INFO +271 -0
  14. physiclaw-0.0.1/PhysiClaw_Report.md +356 -0
  15. physiclaw-0.0.1/QWERTY_KEYBOARD_DETECTOR_PLAN.md +381 -0
  16. physiclaw-0.0.1/README.md +245 -0
  17. physiclaw-0.0.1/docs/.gitkeep +0 -0
  18. physiclaw-0.0.1/physiclaw_architecture_plan.md +803 -0
  19. physiclaw-0.0.1/physiclaw_vlm_tap_plan.md +132 -0
  20. physiclaw-0.0.1/pyproject.toml +52 -0
  21. physiclaw-0.0.1/scripts/calibrate_keyboard.py +125 -0
  22. physiclaw-0.0.1/scripts/demo_agent.py +73 -0
  23. physiclaw-0.0.1/scripts/download_omniparser.py +50 -0
  24. physiclaw-0.0.1/scripts/dump_prompt.py +68 -0
  25. physiclaw-0.0.1/scripts/setup.py +304 -0
  26. physiclaw-0.0.1/scripts/stylus_gen.py +206 -0
  27. physiclaw-0.0.1/scripts/stylus_holder.py +687 -0
  28. physiclaw-0.0.1/skills/search-in-app/SKILL.md +33 -0
  29. physiclaw-0.0.1/src/physiclaw/__init__.py +5 -0
  30. physiclaw-0.0.1/src/physiclaw/agent/__init__.py +1 -0
  31. physiclaw-0.0.1/src/physiclaw/agent/context/AGENT.md +51 -0
  32. physiclaw-0.0.1/src/physiclaw/agent/context/CONVENTION.md +114 -0
  33. physiclaw-0.0.1/src/physiclaw/agent/context/IDENTITY.md +6 -0
  34. physiclaw-0.0.1/src/physiclaw/agent/context/JOBS.md +86 -0
  35. physiclaw-0.0.1/src/physiclaw/agent/context/PERSISTENCE.md +37 -0
  36. physiclaw-0.0.1/src/physiclaw/agent/context/PHYSICLAW.md +58 -0
  37. physiclaw-0.0.1/src/physiclaw/agent/context/SOUL.md +17 -0
  38. physiclaw-0.0.1/src/physiclaw/agent/engine/__init__.py +4 -0
  39. physiclaw-0.0.1/src/physiclaw/agent/engine/builtin_tool.py +631 -0
  40. physiclaw-0.0.1/src/physiclaw/agent/engine/compact.py +197 -0
  41. physiclaw-0.0.1/src/physiclaw/agent/engine/dto.py +59 -0
  42. physiclaw-0.0.1/src/physiclaw/agent/engine/engine.py +447 -0
  43. physiclaw-0.0.1/src/physiclaw/agent/engine/job_store.py +394 -0
  44. physiclaw-0.0.1/src/physiclaw/agent/engine/jobs.py +211 -0
  45. physiclaw-0.0.1/src/physiclaw/agent/engine/mcp_inventory.py +60 -0
  46. physiclaw-0.0.1/src/physiclaw/agent/engine/mcp_tool.py +148 -0
  47. physiclaw-0.0.1/src/physiclaw/agent/engine/memory.py +119 -0
  48. physiclaw-0.0.1/src/physiclaw/agent/engine/plan.py +146 -0
  49. physiclaw-0.0.1/src/physiclaw/agent/engine/prompt.py +292 -0
  50. physiclaw-0.0.1/src/physiclaw/agent/engine/provider.py +329 -0
  51. physiclaw-0.0.1/src/physiclaw/agent/engine/skill.py +144 -0
  52. physiclaw-0.0.1/src/physiclaw/agent/engine/trace.py +357 -0
  53. physiclaw-0.0.1/src/physiclaw/agent/engine/validator.py +142 -0
  54. physiclaw-0.0.1/src/physiclaw/agent/hooks/__init__.py +7 -0
  55. physiclaw-0.0.1/src/physiclaw/agent/hooks/cron.py +215 -0
  56. physiclaw-0.0.1/src/physiclaw/agent/hooks/poll.py +48 -0
  57. physiclaw-0.0.1/src/physiclaw/agent/runtime/__init__.py +4 -0
  58. physiclaw-0.0.1/src/physiclaw/agent/runtime/__main__.py +5 -0
  59. physiclaw-0.0.1/src/physiclaw/agent/runtime/claude.py +248 -0
  60. physiclaw-0.0.1/src/physiclaw/agent/runtime/config.py +15 -0
  61. physiclaw-0.0.1/src/physiclaw/agent/runtime/hook.py +117 -0
  62. physiclaw-0.0.1/src/physiclaw/agent/runtime/launcher.py +99 -0
  63. physiclaw-0.0.1/src/physiclaw/agent/runtime/runtime.py +128 -0
  64. physiclaw-0.0.1/src/physiclaw/agent/runtime/sentinel.py +32 -0
  65. physiclaw-0.0.1/src/physiclaw/core/__init__.py +5 -0
  66. physiclaw-0.0.1/src/physiclaw/core/bridge/__init__.py +21 -0
  67. physiclaw-0.0.1/src/physiclaw/core/bridge/calib.py +184 -0
  68. physiclaw-0.0.1/src/physiclaw/core/bridge/handler.py +149 -0
  69. physiclaw-0.0.1/src/physiclaw/core/bridge/lan.py +74 -0
  70. physiclaw-0.0.1/src/physiclaw/core/bridge/nonce.py +74 -0
  71. physiclaw-0.0.1/src/physiclaw/core/bridge/page.py +46 -0
  72. physiclaw-0.0.1/src/physiclaw/core/bridge/state.py +156 -0
  73. physiclaw-0.0.1/src/physiclaw/core/calibration/__init__.py +11 -0
  74. physiclaw-0.0.1/src/physiclaw/core/calibration/calibrate.py +958 -0
  75. physiclaw-0.0.1/src/physiclaw/core/calibration/handler.py +314 -0
  76. physiclaw-0.0.1/src/physiclaw/core/calibration/state.py +159 -0
  77. physiclaw-0.0.1/src/physiclaw/core/calibration/transforms.py +133 -0
  78. physiclaw-0.0.1/src/physiclaw/core/hardware/__init__.py +17 -0
  79. physiclaw-0.0.1/src/physiclaw/core/hardware/arm.py +391 -0
  80. physiclaw-0.0.1/src/physiclaw/core/hardware/camera.py +269 -0
  81. physiclaw-0.0.1/src/physiclaw/core/hardware/grbl.py +86 -0
  82. physiclaw-0.0.1/src/physiclaw/core/hardware/handler.py +178 -0
  83. physiclaw-0.0.1/src/physiclaw/core/hardware/iphone.py +148 -0
  84. physiclaw-0.0.1/src/physiclaw/core/logger/__init__.py +10 -0
  85. physiclaw-0.0.1/src/physiclaw/core/logger/dumps.py +66 -0
  86. physiclaw-0.0.1/src/physiclaw/core/logger/logger.py +119 -0
  87. physiclaw-0.0.1/src/physiclaw/core/main.py +194 -0
  88. physiclaw-0.0.1/src/physiclaw/core/orchestration/__init__.py +10 -0
  89. physiclaw-0.0.1/src/physiclaw/core/orchestration/orchestrator.py +586 -0
  90. physiclaw-0.0.1/src/physiclaw/core/server/__init__.py +16 -0
  91. physiclaw-0.0.1/src/physiclaw/core/server/app.py +63 -0
  92. physiclaw-0.0.1/src/physiclaw/core/server/bridge.py +65 -0
  93. physiclaw-0.0.1/src/physiclaw/core/server/calibration.py +57 -0
  94. physiclaw-0.0.1/src/physiclaw/core/server/hardware.py +32 -0
  95. physiclaw-0.0.1/src/physiclaw/core/server/mcp.py +25 -0
  96. physiclaw-0.0.1/src/physiclaw/core/server/tools.py +262 -0
  97. physiclaw-0.0.1/src/physiclaw/core/server/types.py +46 -0
  98. physiclaw-0.0.1/src/physiclaw/core/server/warm_start.py +188 -0
  99. physiclaw-0.0.1/src/physiclaw/core/server/watch.py +49 -0
  100. physiclaw-0.0.1/src/physiclaw/core/static/bridge.html +447 -0
  101. physiclaw-0.0.1/src/physiclaw/core/static/qr.html +39 -0
  102. physiclaw-0.0.1/src/physiclaw/core/vision/__init__.py +32 -0
  103. physiclaw-0.0.1/src/physiclaw/core/vision/grid_detect.py +140 -0
  104. physiclaw-0.0.1/src/physiclaw/core/vision/icon_detect.py +194 -0
  105. physiclaw-0.0.1/src/physiclaw/core/vision/keyboard.py +459 -0
  106. physiclaw-0.0.1/src/physiclaw/core/vision/ocr.py +194 -0
  107. physiclaw-0.0.1/src/physiclaw/core/vision/render.py +84 -0
  108. physiclaw-0.0.1/src/physiclaw/core/vision/screen_match.py +201 -0
  109. physiclaw-0.0.1/src/physiclaw/core/vision/ui_elements.py +219 -0
  110. physiclaw-0.0.1/src/physiclaw/core/vision/util.py +464 -0
  111. physiclaw-0.0.1/src/physiclaw/core/vision/watchdog.py +155 -0
  112. physiclaw-0.0.1/uv.lock +2302 -0
@@ -0,0 +1,23 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "mcp__physiclaw__screenshot",
5
+ "mcp__physiclaw__move",
6
+ "mcp__physiclaw__swipe",
7
+ "mcp__physiclaw__park",
8
+ "mcp__physiclaw__bbox_target",
9
+ "mcp__physiclaw__confirm_bbox",
10
+ "mcp__physiclaw__tap",
11
+ "mcp__physiclaw__long_press",
12
+ "mcp__physiclaw__grid_overlay",
13
+ "Bash(uv run:*)",
14
+ "Bash(git mv:*)",
15
+ "Bash(git rm:*)",
16
+ "WebFetch(domain:raw.githubusercontent.com)",
17
+ "WebSearch",
18
+ "WebFetch(domain:www.anthropic.com)",
19
+ "Bash(python3 -c \"import json,sys; d=json.load\\(sys.stdin\\); [print\\(i['path']\\) for i in d.get\\('tree',[]\\) if 'prompt' in i['path'].lower\\(\\) or 'system' in i['path'].lower\\(\\) or 'instruct' in i['path'].lower\\(\\)]\")",
20
+ "Bash(uv pip *)"
21
+ ]
22
+ }
23
+ }
@@ -0,0 +1,137 @@
1
+ ---
2
+ name: calibrate-keyboard
3
+ description: Detect keyboard keys from phone screenshots and generate UI preset for typing. Use when setting up a new phone or the keyboard layout has changed.
4
+ allowed-tools: Bash, Read, Edit, Write
5
+ ---
6
+
7
+ # Keyboard Calibration
8
+
9
+ Guide the user through calibrating the on-screen keyboard for PhysiClaw.
10
+
11
+ ## Step 1: Collect screenshots
12
+
13
+ Ask the user to take two phone screenshots with the keyboard visible:
14
+
15
+ 1. **Alpha keyboard** (default QWERTY layout)
16
+ 2. **Numeric keyboard** (tap the 123 key first)
17
+
18
+ Save both as PNG/JPG in `data/image/keyboard/`. Remove any old images from that directory first.
19
+
20
+ ## Step 2: Check images and run detection
21
+
22
+ Run this check first:
23
+
24
+ ```bash
25
+ uv run python -c "
26
+ import cv2
27
+ from pathlib import Path
28
+ imgs = sorted(Path('data/image/keyboard').glob('*.*'))
29
+ imgs = [p for p in imgs if p.suffix.lower() in ('.png', '.jpg', '.jpeg')]
30
+ print(f'{len(imgs)} images found')
31
+ sizes = set()
32
+ for p in imgs:
33
+ img = cv2.imread(str(p))
34
+ if img is not None:
35
+ sizes.add((img.shape[1], img.shape[0]))
36
+ print(f' {p.name}: {img.shape[1]}x{img.shape[0]}')
37
+ if len(sizes) == 1:
38
+ print('All same size')
39
+ elif len(sizes) > 1:
40
+ print(f'ERROR: different sizes: {sizes}')
41
+ if len(imgs) < 2:
42
+ print('ERROR: need at least 2 images (alpha + numeric)')
43
+ "
44
+ ```
45
+
46
+ Verify:
47
+
48
+ 1. At least 2 images (one alpha, one numeric)
49
+ 2. All images have the same width and height (same phone, same orientation)
50
+ 3. The keyboard background is clean and uniform (no custom themes, no wallpaper keyboards)
51
+ 4. The keyboard is the system default (Gboard, iOS keyboard, etc.) -- not a third-party keyboard
52
+ 5. Original images, no resize, no editing
53
+
54
+ If any check fails, ask the user to fix and re-screenshot.
55
+
56
+ Then run detection:
57
+
58
+ ```bash
59
+ uv run python scripts/calibrate_keyboard.py
60
+ ```
61
+
62
+ This detects key bounding boxes and generates:
63
+
64
+ - Bounding box images in `data/image/keyboard/bbox/`
65
+ - A preset template at `.claude/ui-presets/system-keyboard.md` with positions filled in
66
+
67
+ Check the output: it should report 4 rows per keyboard, ~33 keys for alpha, ~35 for numeric.
68
+ If detection fails or key counts are wrong, ask the user to retake screenshots (original image, no resize, no editing).
69
+
70
+ ## Step 3: Fill ??? labels
71
+
72
+ Keys marked ??? need to be identified. After filling, tell the user:
73
+ "You can open `.claude/ui-presets/system-keyboard.md` to check my editing."
74
+
75
+ For each keyboard page:
76
+
77
+ 1. Tell the user: "Please open `{bbox image path}` to verify my labels." (the path is listed under each page heading)
78
+ 2. Read the bounding box image yourself to identify keys
79
+ 3. Fill ??? entries one page at a time
80
+ 4. Always refer to keys by bbox index: "bbox 11: @, bbox 12: #"
81
+ 5. After filling each page, list your guesses and ask the user to confirm
82
+ 6. Punctuation/symbols are hard to distinguish -- always ask the user to verify these
83
+
84
+ ### Symbol reference for punctuation
85
+
86
+ When identifying symbol keys, select from this list.
87
+ Note: Chinese Pinyin keyboards have no straight quotes (' "). Only curly quotes and enumeration comma.
88
+ If a symbol looks like ' or `, it is most likely 、 (enumeration comma).
89
+
90
+ | Symbol | Name | Type |
91
+ | -- | -------------- | ---- |
92
+ | @ | at sign | English |
93
+ | # | hash / pound | English |
94
+ | $ | dollar sign | English |
95
+ | _ | underscore | English |
96
+ | & | ampersand | English |
97
+ | * | asterisk | English |
98
+ | - | hyphen | English |
99
+ | + | plus | English |
100
+ | / | slash | English |
101
+ | , | comma | English |
102
+ | . | period | English |
103
+ | : | colon | English |
104
+ | ; | semicolon | English |
105
+ | ! | exclamation mark | English |
106
+ | ? | question mark | English |
107
+ | ( | left parenthesis | English |
108
+ | ) | right parenthesis | English |
109
+ | （ | left parenthesis (Chinese) | Chinese |
110
+ | ） | right parenthesis (Chinese) | Chinese |
111
+ | 、 | enumeration comma | Chinese |
112
+ | “ | left double quote (Chinese) | Chinese |
113
+ | ” | right double quote (Chinese) | Chinese |
114
+ | ： | colon (Chinese) | Chinese |
115
+ | ； | semicolon (Chinese) | Chinese |
116
+ | ！ | exclamation mark (Chinese) | Chinese |
117
+ | ？ | question mark (Chinese) | Chinese |
118
+ | ， | comma (Chinese) | Chinese |
119
+ | 。 | period (Chinese) | Chinese |
120
+ | ¥ | yen / RMB sign | Currency |
121
+
122
+ ## Step 4: Verify positions unchanged
123
+
124
+ After filling all ???, verify that no Position values were accidentally modified by comparing
125
+ the Position columns of the filled file against the reference copy:
126
+
127
+ ```bash
128
+ diff <(grep -oE '\[[0-9., ]+\]' .claude/ui-presets/system-keyboard.md) \
129
+ <(grep -oE '\[[0-9., ]+\]' data/image/keyboard/bbox/system-keyboard.ref.md)
130
+ ```
131
+
132
+ If no output, all positions match. If there are differences, the Position column was accidentally
133
+ edited — restore those rows from the reference file.
134
+
135
+ Then ask the user to review the final `.claude/ui-presets/system-keyboard.md`.
136
+
137
+ Done. The AI agent can now use the keyboard preset for typing.
@@ -0,0 +1,8 @@
1
+ # System Keyboard
2
+
3
+ To type uppercase: tap ⇧ Shift first, then tap the letter. Shift auto-reverts to lowercase after one letter.
4
+ To type a digit or symbol: tap ?123 to switch to Numeric Keyboard, type, then tap 返回(ABC) to switch back.
5
+ To type Chinese (Pinyin): type the pinyin letters, then tap Space to confirm the first candidate. E.g. to type "你好", tap n-i-h-a-o then tap Space.
6
+ To hide the keyboard: tap the back button or tap outside the text field.
7
+
8
+ {{pages}}
@@ -0,0 +1,115 @@
1
+ ---
2
+ name: cron
3
+ description: Manage scheduled jobs in jobs/jobs.md — list, add, or cancel cron entries. Verifies the file parses before and after every change.
4
+ allowed-tools: Bash, Read, Edit, Write
5
+ ---
6
+
7
+ # Cron Job Management
8
+
9
+ The runtime checks `jobs/jobs.md` every minute. Each `## <id>` section
10
+ is a scheduled job. When a job's `Schedule:` matches the current minute
11
+ and its `Status:` is `pend`, the hook fires a `Trigger` and the
12
+ runtime spawns Claude Code with the job's context.
13
+
14
+ **Jobs are never deleted.** They transition through statuses:
15
+ `pend` → `fired` (hook) → `pend` (periodic done/fail) or `done`/`fail` (one-time).
16
+ `pend` → `cancel` (user).
17
+
18
+ ## CLI tools
19
+
20
+ ```bash
21
+ uv run python -m physiclaw.agent.hooks.cron verify # parse + list every job
22
+ uv run python -m physiclaw.agent.hooks.cron jobs-to-do # list fired jobs awaiting agent execution
23
+ uv run python -m physiclaw.agent.hooks.cron done <id> <result> # agent: mark complete with result
24
+ uv run python -m physiclaw.agent.hooks.cron fail <id> <reason> # agent: mark failed with reason
25
+ uv run python -m physiclaw.agent.hooks.cron cancel <id> # cancel a job
26
+ ```
27
+
28
+ Always run `verify` first, and again after any change.
29
+
30
+ ### Layout
31
+
32
+ 1. `## <id>` heading — lowercase kebab-case
33
+ 2. **Description** — one plain line below the heading (not a list item).
34
+ For the user: "what is this job?"
35
+ 3. **Fields** — markdown list (`- Key: value`).
36
+
37
+ **Description vs Context**: Description is for the user scanning the
38
+ file. Context is for the agent executing the job — include everything
39
+ Claude needs: which app to open, what to do, edge cases.
40
+
41
+ ### Fields
42
+
43
+ | Field | Set by | Notes |
44
+ |--------------------|----------|--------------------------------------------------|
45
+ | description line | skill | One line under heading (required) |
46
+ | `Type:` | skill | `periodic` or `one-time` (required) |
47
+ | `Schedule:` | skill | 5-field cron in backticks (required) |
48
+ | `Create time:` | skill | ISO timestamp when the job was added |
49
+ | `Next fire time:` | skill | Computed after each fire |
50
+ | `Last fire time:` | hook | When the hook last fired this job |
51
+ | `Execution time:` | agent | When Claude finished — set via `done`/`fail` CLI |
52
+ | `Execution result:`| agent | One-line result summary — set via `done`/`fail` |
53
+ | `Status:` | mixed | `pend`, `fired`, `cancel`, `done`, or `fail` |
54
+ | `Context:` | skill | Full agent instructions (required, last field) |
55
+
56
+ **Status lifecycle:**
57
+
58
+ - `pend` — waiting for next fire time to arrive
59
+ - `fired` — hook fired the job, awaiting agent execution
60
+ - `cancel` — user cancelled; never fires
61
+ - `done` — agent finished a one-time job (`done <id> <result>`)
62
+ - `fail` — agent failed a one-time job (`fail <id> <reason>`); a periodic job returns to `pend` instead
63
+
64
+ **Transitions:**
65
+ `pend` → `fired` (hook fires) → `pend` (periodic done/fail) or
66
+ `done`/`fail` (one-time). `pend` → `cancel` (user).
67
+
68
+ **Time format:** ISO 8601 truncated to minute, naive local time.
69
+ Example: `2026-04-09T09:00`. Unfired fields use `(never)`.
70
+
71
+ ## Workflows
72
+
73
+ ### List jobs
74
+
75
+ 1. Run verify
76
+ 2. Show each job's id, description, type, schedule, status, and times
77
+
78
+ ### Add a job
79
+
80
+ 1. Run verify
81
+ 2. Based on user request, determine: **id**, **description**, **type**,
82
+ **schedule**, **context**
83
+ 3. Create `jobs/jobs.md` if it doesn't exist:
84
+
85
+ ```markdown
86
+ # Cron Jobs
87
+ ```
88
+
89
+ 4. Append using this template:
90
+
91
+ ```markdown
92
+ ## <id (e.g. `weather-check`, `mom-birthday`)>
93
+
94
+ <one-line description>
95
+
96
+ - Type: <periodic|one-time>
97
+ - Schedule: `<cron expression>`
98
+ - Create time: <now as ISO>
99
+ - Next fire time: <compute from schedule and current time, fill as ISO>
100
+ - Last fire time: (never)
101
+ - Execution time: (never)
102
+ - Execution result: (never)
103
+ - Status: pend
104
+ - Context: <full instructions for the agent>
105
+ ```
106
+
107
+ 5. Run verify (this also validates Next fire time matches the schedule)
108
+ 6. Tell the user when the job will next fire
109
+
110
+ ### Cancel a job
111
+
112
+ 1. Run verify
113
+ 2. Read `jobs/jobs.md`, confirm which job to cancel with the user
114
+ 3. Edit the `Status:` field to `cancel`
115
+ 4. Run verify
@@ -0,0 +1,38 @@
1
+ ---
2
+ name: jd
3
+ description: Use when the task is grocery / fresh-food shopping via JD / 京东 / 七鲜 / 7Fresh — owner asks to buy 零食, 水果, 蔬菜, 日用品, or to place an order on 京东. NOT for clothing, electronics, or non-grocery shopping, NOT for other grocery apps (Meituan / 盒马 / Dingdong).
4
+ ---
5
+
6
+ # JD (京东) — Grocery shopping
7
+
8
+ Use **京东七鲜** (JD 7Fresh) for groceries. Other JD categories need explicit owner ask.
9
+
10
+ ## Tool choice
11
+
12
+ Prefer `peek()` — cheap, no app-side reactions. Reach for `screenshot()` only when the target is icon-only (no text label). **`screenshot()` on a commodity detail page triggers the share overlay** — see Gotcha.
13
+
14
+ ## Flow
15
+
16
+ 1. `/open-app 京东` (or `JD`). Make sure you're on the **首页** tab (top nav), then tap into 京东七鲜.
17
+ 2. Tap the search box. If it has stale text, tap **backspace** (bbox in PHYSICLAW.md "iPhone keyboard bboxes") until empty — see also `Skill(name="search-in-app")` for the full clear-paste-submit flow. Type/paste the item, open its shop page.
18
+ 3. Tap 加入购物车 (Add to cart).
19
+ - **A spec-selection sheet often slides up** (size, brand, weight). Pick the right spec, then tap **确定** to confirm. The sheet dismisses, the item is added, and you land back on the product page.
20
+ - Items without variants skip the sheet — one tap on 加入购物车 adds directly.
21
+ - **NEVER tap 加入购物车 again on the product page.** The product page looks the same before and after add — you cannot tell from the layout alone whether the add succeeded. Re-tapping just re-opens the spec sheet, you re-confirm, and you've now added the item TWICE. Trust that 确定 worked; verify by checking the cart-icon badge count (top-right corner of the page) or by going to the cart in step 4.
22
+ 4. Tap the cart icon, review line items, tap 去结算. **Always review the cart here** — if quantities look wrong (item appearing twice when you only meant once), that's a sign step 3 was tapped twice; remove the dup before checkout.
23
+ 5. Send the owner: item, qty, price, address, fees, ETA. Wait for explicit OK.
24
+ 6. Tap 提交订单 / 立即支付.
25
+
26
+ ## Gotcha — screenshot triggers share popup
27
+
28
+ JD intercepts the iOS screenshot gesture and overlays a 分享截屏 menu (朋友圈 / QQ / 微信好友 / 保存图片 / 搜问题). The screenshot captured the real page first; recover without re-shooting:
29
+
30
+ 1. `screenshot()` — pixel-perfect bboxes, listing captured.
31
+ 2. `peek()` — if it shows share-sheet text + dim area, the popup is covering the page.
32
+ 3. Dismiss the popup (see below).
33
+ 4. `peek()` — confirm the overlay is gone.
34
+ 5. **Act on labelled targets from the screenshot's text rows.** After the peek stubs the screenshot, icon rows are dropped but text rows (`加入购物车`, product title, price, `去结算`) survive in the stubbed listing — tap those. For the `+` add-to-cart button on list rows, it's typically detected as a `[text]` row too (OCR reads the `+` glyph), so it survives. If what you need is purely icon-only (e.g. the cart-badge icon in the top-right), re-`screenshot` — that's rare.
35
+
36
+ ## Dismiss popup / share sheet / bottom sheet
37
+
38
+ Tap the **dimmed area** above the popup. Safe target: `[0.05, 0.40, 0.15, 0.60]` (center ≈ x=0.10, y=0.50 — left edge, vertical middle). The mirror bbox on the right edge `[0.85, 0.40, 0.95, 0.60]` works too.
@@ -0,0 +1,64 @@
1
+ ---
2
+ name: open-app
3
+ description: Use when you need to launch an app that is NOT on the current screen and NOT in the dock — any time "open <app>" is the next step and you can't see the icon. Use FIRST before tapping blindly. NOT needed when the target app's icon is already visible.
4
+ ---
5
+
6
+ # Open App via Spotlight
7
+
8
+ **Argument:** App name (e.g., "美团", "WeChat", "Safari").
9
+
10
+ ## Steps
11
+
12
+ Each step is one `[note, one-other]` turn. `<name>` placeholders
13
+ refer to the **Fixed elements** table at the bottom.
14
+
15
+ 1. `send_to_clipboard(text="<app name>")`
16
+ — copy the app name (use the exact text the owner asked for, e.g.
17
+ `"美团"`, `"WeChat"`).
18
+ 2. `home_screen()` — return to a clean launch pad. **Skip if you're
19
+ already on the home screen** (peek shows app-icon grid + dock,
20
+ no in-app chrome) — slow physical motion (~2s); don't waste it.
21
+ 3. `swipe(bbox=<spotlight-pull>, direction="down", size="l")`
22
+ — open Spotlight. Bbox is mid-screen (NOT the top edge — that
23
+ opens Notification Center). `size="l"` (~4cm) avoids overshoot.
24
+ 4. `peek()` — refresh listing; the search field and keyboard should
25
+ be visible.
26
+ 5. **If the field has stale text** (no "搜索"/"Search" placeholder),
27
+ clear it via tap+backspace. Chain with ONE `sequence` call (still
28
+ one tool call, so the turn remains `[note, one-other]`):
29
+
30
+ ```python
31
+ sequence(
32
+ step1={"tool_name": "tap", "arg": <search-field>},
33
+ step2={"tool_name": "tap", "arg": <backspace>},
34
+ step3={"tool_name": "tap", "arg": <backspace>},
35
+ step4={"tool_name": "tap", "arg": <backspace>},
36
+ step5={"tool_name": "tap", "arg": <backspace>},
37
+ )
38
+ ```
39
+ Tap 1 focuses; taps 2-5 each delete one char. Over-tapping an
40
+ empty field is a no-op — safe to over-estimate. `peek` to verify;
41
+ if text remains, run another sequence. **Skip when the field is
42
+ empty.** Prefer tap-backspace over `long_press(backspace)` —
43
+ per-tap deletions are deterministic.
44
+ 6. `long_press(bbox=<search-field>)` — opens the Paste popover above
45
+ the search field. If no popover appears after long-press, you
46
+ tapped the wrong element — re-`peek` and pick again.
47
+ 7. `peek()` — refresh listing; "Paste" / "粘贴" appears in the popover.
48
+ 8. `tap(bbox=<paste-button>)` — paste the app name into the field.
49
+ 9. `peek()` — refresh listing; search results render below the field.
50
+ 10. `tap(bbox=<app-icon>)` — launch the app. (See `<app-icon>` row
51
+ below for decoy warnings.)
52
+
53
+ ## Fixed elements
54
+
55
+ Typical bboxes as priors — per CONVENTION.md, copy verbatim from the
56
+ latest `peek` / `screenshot` listing before tapping.
57
+
58
+ | Name | Typical bbox | How to find it / decoys |
59
+ |---|---|---|
60
+ | `<spotlight-pull>` | `[0.3, 0.4, 0.7, 0.6]` | Mid-screen rectangle. Swipe-down anchor; not a peek target. |
61
+ | `<search-field>` | `[0.11, 0.60, 0.99, 0.66]` | The **focused input** at the BOTTOM, just above the keyboard, y≈0.62. Full-width field with mic icon on right. |
62
+ | `<backspace>` | see PHYSICLAW.md "iPhone keyboard bboxes" | — |
63
+ | `<paste-button>` | `[0.09, 0.56, 0.19, 0.58]` | Row labeled `Paste` / `粘贴` in the pill popover just ABOVE the focused field at y≈0.56. Often next to an `AutoFill` button (same y, x≈0.26-0.38) — pick `Paste`, not `AutoFill`. Dismisses if you tap elsewhere first. |
64
+ | `<app-icon>` | varies — in search results below the field | Row whose label matches the app name **exactly**. Skip rows with App Store badges or "in Safari" / web hits. |
@@ -0,0 +1,117 @@
1
+ ---
2
+ name: phone-setup
3
+ description: Guide user step-by-step to set up their iPhone for PhysiClaw — enable AssistiveTouch, create three iOS Shortcuts (take screenshot, upload latest, clipboard sync).
4
+ allowed-tools: Bash, Read
5
+ ---
6
+
7
+ # iPhone Setup for PhysiClaw
8
+
9
+ Set up three iOS Shortcuts (take screenshot · upload latest · clipboard sync) and bind them to AssistiveTouch taps. The server must be running.
10
+
11
+ ## Step 1: Check `<name>.local` hostname
12
+
13
+ Shortcuts that embed the LAN IP break when DHCP changes it. `<name>.local` survives IP changes on the same Wi-Fi.
14
+
15
+ Check the current name resolves:
16
+
17
+ ```bash
18
+ CUR=$(scutil --get LocalHostName) && echo "$CUR" && ping -c 1 -W 1000 "${CUR}.local"
19
+ ```
20
+
21
+ Use whatever the current name is — do **not** rename automatically. If the user wants a more stable or unique name (e.g. to avoid macOS numeric collision suffixes like `-2`), they can rename it themselves with:
22
+
23
+ ```bash
24
+ sudo scutil --set LocalHostName "new-name" && dscacheutil -flushcache
25
+ ```
26
+
27
+ Shortcut URLs use the lowercase form (`<name>.local`).
28
+
29
+ ## Step 2: Server URLs
30
+
31
+ ```bash
32
+ uv run python -c "
33
+ from physiclaw.core.bridge import bridge_base_urls
34
+ p, f = bridge_base_urls(8048)
35
+ if p != f: print(f'Recommended: {p}')
36
+ print(f'Fallback (IP): {f}')
37
+ "
38
+ ```
39
+
40
+ Use the `Recommended` URL in the Shortcuts. Fall back to the IP only if the network blocks mDNS, or if no `Recommended` line is printed.
41
+
42
+ ## Step 3: "PhysiClaw Tap" Shortcut (take screenshot)
43
+
44
+ Tell the user:
45
+
46
+ > Shortcuts app → **+**:
47
+ >
48
+ > 1. Add **"Take Screenshot"**
49
+ > 2. Rename to **"PhysiClaw Tap"** → Done
50
+
51
+ Wait for confirmation.
52
+
53
+ ## Step 4: "PhysiClaw Screenshot" Shortcut (upload latest)
54
+
55
+ Tell the user (replace `HOST` with the URL from Step 2):
56
+
57
+ > Shortcuts app → **+**:
58
+ >
59
+ > 1. Add **"Get Latest Screenshots"**
60
+ > 2. Add **"Get Contents of URL"**:
61
+ > - URL: `http://HOST/api/bridge/screenshot`
62
+ > - Show More → Method **POST**, Request Body **File**, File → **Screenshots** variable
63
+ > 3. Rename to **"PhysiClaw Screenshot"** → Done
64
+
65
+ Wait for confirmation.
66
+
67
+ ## Step 5: "PhysiClaw Clipboard" Shortcut (sync clipboard)
68
+
69
+ Tell the user (same `HOST`):
70
+
71
+ > Shortcuts app → **+**:
72
+ >
73
+ > 1. Add **"Get Contents of URL"** → URL `http://HOST/api/bridge/clipboard` (GET)
74
+ > 2. Add **"Copy to Clipboard"** → input = **Contents of URL**
75
+ > 3. Rename to **"PhysiClaw Clipboard"** → Done
76
+
77
+ Wait for confirmation.
78
+
79
+ ## Step 6: AssistiveTouch (skip if configured)
80
+
81
+ Tell the user:
82
+
83
+ > Settings → Accessibility → Touch → **AssistiveTouch ON**. Custom Actions:
84
+ >
85
+ > - **Single-Tap** → Shortcut → PhysiClaw Tap
86
+ > - **Double-Tap** → Shortcut → PhysiClaw Screenshot
87
+ > - **Long Press** → Shortcut → PhysiClaw Clipboard
88
+
89
+ Wait for confirmation.
90
+
91
+ ## Step 7: Test
92
+
93
+ Tell the user:
94
+
95
+ > Tap AssistiveTouch **once** (take screenshot), then **double-tap** it (upload latest).
96
+
97
+ Record the baseline, then poll for a new file:
98
+
99
+ ```bash
100
+ baseline=$(ls -t data/phone/screenshot/ 2>/dev/null | head -1)
101
+ for i in $(seq 1 25); do
102
+ newest=$(ls -t data/phone/screenshot/ 2>/dev/null | head -1)
103
+ [ -n "$newest" ] && [ "$newest" != "$baseline" ] && { echo "✓ $newest"; open "data/phone/screenshot/$newest"; exit 0; }
104
+ sleep 1
105
+ done
106
+ echo "✗ no upload in 25s"
107
+ ```
108
+
109
+ If nothing arrives:
110
+
111
+ - Run the Shortcut manually (Shortcuts app → ▶) — check for errors.
112
+ - From the Mac, `ping "$(scutil --get LocalHostName).local"` — if it fails from the phone's network, mDNS is blocked; swap both Shortcut URLs to the IP fallback from Step 2.
113
+ - Ensure at least one screenshot exists in Photos.
114
+
115
+ ## Done
116
+
117
+ > AssistiveTouch: single tap = take screenshot, double tap = upload latest, long press = clipboard fetch. If a Shortcut breaks after a network change, rerun this setup to get the new URL and paste it into each Shortcut's "Get Contents of URL" step.
@@ -0,0 +1,15 @@
1
+ ---
2
+ name: setup
3
+ description: Connect the robotic arm and camera, then calibrate. Required before using any PhysiClaw MCP tools.
4
+ allowed-tools: Bash
5
+ ---
6
+
7
+ # Setup
8
+
9
+ ```bash
10
+ uv run python scripts/setup.py # interactive, default
11
+ uv run python scripts/setup.py -y # auto mode, skip prompts
12
+ uv run python scripts/setup.py --trace # add edge-trace visual check at end
13
+ ```
14
+
15
+ Fails with non-zero exit and prints which step failed. Fix the physical setup and rerun.
@@ -0,0 +1,38 @@
1
+ ---
2
+ name: setup-vision-models
3
+ description: Download and prepare OmniParser icon detection model for the screenshot() tool. One-time setup — installs temporary deps, converts model, then cleans up.
4
+ allowed-tools: Bash, Read
5
+ ---
6
+
7
+ # Setup Vision Models
8
+
9
+ One-time setup for icon detection and OCR in `screenshot()`. RapidOCR and onnxruntime are already in project dependencies — this only sets up the OmniParser ONNX model.
10
+
11
+ ## Step 1: Check
12
+
13
+ ```bash
14
+ ls data/model/omniparser_icon_detect/model.onnx 2>/dev/null && echo "OK" || echo "MISSING"
15
+ ```
16
+
17
+ If OK, tell the user it's already set up and stop.
18
+
19
+ ## Step 2: Install, convert, clean up
20
+
21
+ ```bash
22
+ uv sync --group convert
23
+ uv run python scripts/download_omniparser.py
24
+ uv sync
25
+ ```
26
+
27
+ ## Step 3: Verify
28
+
29
+ ```bash
30
+ uv run python -c "
31
+ from physiclaw.core.vision.icon_detect import IconDetector
32
+ from physiclaw.core.vision.ocr import OCRReader
33
+ IconDetector(); OCRReader()
34
+ print('OK')
35
+ "
36
+ ```
37
+
38
+ Tell the user setup is complete.
@@ -0,0 +1,100 @@
1
+ ---
2
+ name: wechat
3
+ description: Use when the task involves WeChat / 微信 — reading the owner's IM messages, sending a chat reply, finding a contact by name, or any "check messages" / "reply to <name>" request. NOT for Messages / SMS / Telegram / Signal, and NOT for phone calls or FaceTime.
4
+ ---
5
+
6
+ # WeChat (微信)
7
+
8
+ The owner's primary IM.
9
+
10
+ ## Tool choice
11
+
12
+ High-frequency app. Use `peek()` to check chat status — the camera view reliably reads Chinese text bubbles.
13
+
14
+ **Never act on a chat-list preview.** Rows on the Chats tab show only the most recent message per contact, and it's truncated. To read the owner's actual messages, you MUST tap the row and enter the thread, then peek there.
15
+
16
+ ## Flow — read messages
17
+
18
+ 1. If already in the right chat, skip to step 5.
19
+ 2. From Home, tap the WeChat dock icon.
20
+ 3. If not on the **Chats** tab, tap it.
21
+ 4. **Tap the target 1:1 contact's row** — this step is non-skippable. A chat-list preview is truncated and hides earlier messages if the owner sent multiple since your last reply.
22
+ 5. Read the new messages. The thread opens at the bottom; if the topmost visible bubble doesn't connect to your last reply (or looks like a new unrelated message), swipe down on the bubble area to scroll up until you see the first bubble after your last reply.
23
+
24
+ ## Flow — voice message
25
+
26
+ Use the convert-to-text option, then `peek()` the transcript that renders under the bubble.
27
+
28
+ Reply with your reading + planned action and wait for OK before executing. ASR mishears names, amounts, addresses — never act on a voice instruction unconfirmed.
29
+
30
+ ## Flow — send a message
31
+
32
+ Two states: **keyboard hidden** (input bar at the bottom of the screen) and **keyboard visible** (input bar shifted up above the keyboard, send key on the keyboard).
33
+
34
+ 1. Confirm the right contact is shown in the chat header.
35
+ 2. If keyboard is hidden, tap the input box (keyboard-hidden bbox) — the keyboard opens and the input shifts to the keyboard-visible row.
36
+ 3. If the input has stale text, tap **backspace** until empty.
37
+ 4. `send_to_clipboard(text)`, then long-press the input box (keyboard-visible bbox).
38
+ 5. Tap **Paste** in the popup.
39
+ 6. Tap the keyboard's **send** key — **not** the (+) on the input bar.
40
+ 7. Hide the keyboard (see Hide keyboard below).
41
+ 8. Confirm the bubble appeared in the chat.
42
+ 9. Tap the back arrow `<` (top-left) to return to the Chats list — leaves WeChat in its main state for the next wake.
43
+
44
+ ### Fast path — `sequence` (5 steps)
45
+
46
+ When you're already on the right 1:1 chat, the input is clean, and the keyboard is hidden, collapse steps 2–6 (step 3 is skipped because the input is already empty) into one `sequence` call:
47
+
48
+ ```python
49
+ sequence(
50
+ step1 = {"tool_name": "tap", "arg": [0.100, 0.910, 0.700, 0.960]},
51
+ step2 = {"tool_name": "send_to_clipboard", "arg": "<your text>"},
52
+ step3 = {"tool_name": "long_press", "arg": [0.100, 0.575, 0.700, 0.625]},
53
+ step4 = {"tool_name": "tap", "arg": [0.050, 0.530, 0.220, 0.570]},
54
+ step5 = {"tool_name": "tap", "arg": [0.752, 0.864, 0.992, 0.917]},
55
+ )
56
+ ```
57
+
58
+ After the sequence: hide keyboard, confirm bubble, tap back arrow (steps 7–9 above).
59
+
60
+ ## Fixed elements
61
+
62
+ Bboxes `[left, top, right, bottom]` in 0-1 screen coords. Re-peek if a row looks off — banners shift the layout.
63
+
64
+ **WeChat dock icon (Home Screen):** `[0.294, 0.891, 0.474, 0.967]`
65
+
66
+ ### Bottom nav
67
+
68
+ | Tab | Bbox |
69
+ | ----------------- | ------------------------------- |
70
+ | Chats (微信) | `[0.070, 0.945, 0.190, 0.965]` |
71
+ | Contacts (通讯录) | `[0.324, 0.947, 0.437, 0.962]` |
72
+ | Discover (发现) | `[0.578, 0.947, 0.684, 0.963]` |
73
+ | Me (我) | `[0.840, 0.945, 0.940, 0.965]` |
74
+
75
+ ### Chats list rows
76
+
77
+ Each row is **0.08 tall**. Derive the Nth row from the 1st by adding `0.08 × (N-1)` to the y values.
78
+
79
+ | Row | Bbox |
80
+ | --- | ------------------------------- |
81
+ | 1st | `[0.160, 0.180, 0.940, 0.260]` |
82
+ | 2nd | `[0.160, 0.260, 0.940, 0.340]` |
83
+
84
+ ### Chat page
85
+
86
+ | Element | When | Bbox |
87
+ | --------------- | ------------------- | ------------------------------- |
88
+ | Back arrow `<` | always | `[0.016, 0.066, 0.082, 0.102]` |
89
+ | Contact name | always | `[0.400, 0.060, 0.600, 0.100]` |
90
+ | Text input area | keyboard hidden | `[0.100, 0.910, 0.700, 0.960]` |
91
+ | Text input area | keyboard visible | `[0.100, 0.575, 0.700, 0.625]` |
92
+ | Paste button | long-pressing input | `[0.050, 0.530, 0.220, 0.570]` |
93
+
94
+ The send key on the keyboard and the backspace key are standard iPhone keyboard positions — see PHYSICLAW.md "iPhone keyboard bboxes".
95
+
96
+ ### Hide keyboard
97
+
98
+ A small upward swipe in the empty chat scroll area dismisses the keyboard without scrolling the just-sent bubble off-screen.
99
+
100
+ `swipe(bbox=[0.300, 0.300, 0.700, 0.500], direction="up", size="s")`