browser-automation-skill 0.71.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +144 -0
  3. package/SECURITY.md +39 -0
  4. package/SKILL.md +206 -0
  5. package/bin/cli.mjs +55 -0
  6. package/install.sh +143 -0
  7. package/package.json +54 -0
  8. package/references/adapter-candidates.md +40 -0
  9. package/references/browser-mcp-cheatsheet.md +132 -0
  10. package/references/browser-stats-cheatsheet.md +155 -0
  11. package/references/chrome-devtools-mcp-cheatsheet.md +232 -0
  12. package/references/midscene-integration.md +359 -0
  13. package/references/obscura-cheatsheet.md +103 -0
  14. package/references/playwright-cli-cheatsheet.md +64 -0
  15. package/references/playwright-lib-cheatsheet.md +90 -0
  16. package/references/recipes/add-a-tool-adapter.md +134 -0
  17. package/references/recipes/agent-workflows/README.md +37 -0
  18. package/references/recipes/agent-workflows/cache-driven-bulk-operation.md +110 -0
  19. package/references/recipes/agent-workflows/flow-record-and-replay.md +102 -0
  20. package/references/recipes/agent-workflows/incremental-pattern-discovery.md +125 -0
  21. package/references/recipes/agent-workflows/login-then-scrape.md +100 -0
  22. package/references/recipes/anti-patterns-tool-extension.md +182 -0
  23. package/references/recipes/body-bytes-not-body.md +139 -0
  24. package/references/recipes/cache-write-security.md +210 -0
  25. package/references/recipes/fingerprint-rescue.md +154 -0
  26. package/references/recipes/model-routing.md +143 -0
  27. package/references/recipes/path-security.md +138 -0
  28. package/references/recipes/privacy-canary.md +96 -0
  29. package/references/recipes/visual-rescue-hook.md +182 -0
  30. package/references/stats-prices.json +42 -0
  31. package/references/stats-schema.json +77 -0
  32. package/references/tool-versions.md +8 -0
  33. package/scripts/browser-add-site.sh +113 -0
  34. package/scripts/browser-assert.sh +106 -0
  35. package/scripts/browser-audit.sh +68 -0
  36. package/scripts/browser-baseline.sh +135 -0
  37. package/scripts/browser-click.sh +100 -0
  38. package/scripts/browser-creds-add.sh +254 -0
  39. package/scripts/browser-creds-list.sh +67 -0
  40. package/scripts/browser-creds-migrate.sh +122 -0
  41. package/scripts/browser-creds-remove.sh +69 -0
  42. package/scripts/browser-creds-rotate-totp.sh +109 -0
  43. package/scripts/browser-creds-show.sh +82 -0
  44. package/scripts/browser-creds-totp.sh +94 -0
  45. package/scripts/browser-do.sh +630 -0
  46. package/scripts/browser-doctor.sh +365 -0
  47. package/scripts/browser-drag.sh +90 -0
  48. package/scripts/browser-extract.sh +192 -0
  49. package/scripts/browser-fill.sh +142 -0
  50. package/scripts/browser-flow.sh +316 -0
  51. package/scripts/browser-history.sh +187 -0
  52. package/scripts/browser-hover.sh +92 -0
  53. package/scripts/browser-inspect.sh +188 -0
  54. package/scripts/browser-list-sessions.sh +78 -0
  55. package/scripts/browser-list-sites.sh +42 -0
  56. package/scripts/browser-login.sh +279 -0
  57. package/scripts/browser-mcp.sh +65 -0
  58. package/scripts/browser-migrate.sh +195 -0
  59. package/scripts/browser-open.sh +134 -0
  60. package/scripts/browser-press.sh +80 -0
  61. package/scripts/browser-remove-session.sh +72 -0
  62. package/scripts/browser-remove-site.sh +68 -0
  63. package/scripts/browser-replay.sh +206 -0
  64. package/scripts/browser-route.sh +174 -0
  65. package/scripts/browser-select.sh +122 -0
  66. package/scripts/browser-show-session.sh +57 -0
  67. package/scripts/browser-show-site.sh +37 -0
  68. package/scripts/browser-snapshot.sh +176 -0
  69. package/scripts/browser-stats.sh +522 -0
  70. package/scripts/browser-tab-close.sh +112 -0
  71. package/scripts/browser-tab-list.sh +70 -0
  72. package/scripts/browser-tab-switch.sh +111 -0
  73. package/scripts/browser-upload.sh +132 -0
  74. package/scripts/browser-use.sh +60 -0
  75. package/scripts/browser-vlm.sh +707 -0
  76. package/scripts/browser-wait.sh +97 -0
  77. package/scripts/install-git-hooks.sh +16 -0
  78. package/scripts/lib/capture.sh +356 -0
  79. package/scripts/lib/common.sh +262 -0
  80. package/scripts/lib/credential.sh +237 -0
  81. package/scripts/lib/fingerprint-rescue.js +123 -0
  82. package/scripts/lib/flow.sh +448 -0
  83. package/scripts/lib/flow_record.sh +210 -0
  84. package/scripts/lib/mask.sh +49 -0
  85. package/scripts/lib/memory.sh +427 -0
  86. package/scripts/lib/migrate.sh +390 -0
  87. package/scripts/lib/migrators/README.md +23 -0
  88. package/scripts/lib/migrators/memory/v1_to_v2.sh +15 -0
  89. package/scripts/lib/migrators/recent_urls/README.md +13 -0
  90. package/scripts/lib/migrators/stats/README.md +24 -0
  91. package/scripts/lib/node/chrome-devtools-bridge.mjs +1812 -0
  92. package/scripts/lib/node/mcp-server.mjs +531 -0
  93. package/scripts/lib/node/mcp-tools.json +68 -0
  94. package/scripts/lib/node/playwright-driver.mjs +1104 -0
  95. package/scripts/lib/node/totp-core.mjs +52 -0
  96. package/scripts/lib/node/totp.mjs +52 -0
  97. package/scripts/lib/node/url-pattern-cluster.mjs +102 -0
  98. package/scripts/lib/node/url-pattern-resolver.mjs +77 -0
  99. package/scripts/lib/output.sh +79 -0
  100. package/scripts/lib/router.sh +342 -0
  101. package/scripts/lib/sanitize.sh +107 -0
  102. package/scripts/lib/secret/keychain.sh +91 -0
  103. package/scripts/lib/secret/libsecret.sh +74 -0
  104. package/scripts/lib/secret/plaintext.sh +75 -0
  105. package/scripts/lib/secret_backend_select.sh +57 -0
  106. package/scripts/lib/session.sh +153 -0
  107. package/scripts/lib/site.sh +126 -0
  108. package/scripts/lib/stats.sh +419 -0
  109. package/scripts/lib/tool/.gitkeep +0 -0
  110. package/scripts/lib/tool/chrome-devtools-mcp.sh +349 -0
  111. package/scripts/lib/tool/obscura.sh +249 -0
  112. package/scripts/lib/tool/playwright-cli.sh +155 -0
  113. package/scripts/lib/tool/playwright-lib.sh +106 -0
  114. package/scripts/lib/verb_helpers.sh +222 -0
  115. package/scripts/lib/visual-rescue-default.sh +145 -0
  116. package/scripts/regenerate-docs.sh +99 -0
  117. package/uninstall.sh +51 -0
@@ -0,0 +1,359 @@
1
+ # midscene integration — design notes
2
+
3
+ `web-infra-dev/midscene` (MIT, 13k stars, updated 2026-05-20) is a vision-driven
4
+ UI automation framework: it grounds element selection in screenshots
5
+ processed by a visual-language model (VLM), not in DOM accessibility refs.
6
+ This file collects what we learned from the docs and proposes three
7
+ concrete integration paths into `browser-automation-skill`. No code in this
8
+ file — just design with verbatim citations and proposed env-var blocks.
9
+
10
+ ## Why this matters
11
+
12
+ | Dimension | Our skill (today) | midscene |
13
+ |---|---|---|
14
+ | Element locator | `eN` accessibility refs (text-encoded) | screenshot coordinates (vision) |
15
+ | Per-action token cost | ~400 tok / page snapshot, 0 / cached action | ~1000 tok / screenshot, every action |
16
+ | Repeat-action cost | 0 via `browser-do` intent cache | full re-cost (cache is prompt-keyed, not intent-keyed) |
17
+ | Handles `<canvas>` / pixel-only UIs | NO | YES |
18
+ | Handles Android / iOS / desktop apps | NO | YES |
19
+ | Local-model option | n/a (Claude does grounding) | YES — Qwen3-VL / UI-TARS via OpenAI-compatible endpoint |
20
+
21
+ The paradigms are **complementary, not competing**. For DOM apps with
22
+ stable accessibility trees, our text refs win on token cost. For canvas,
23
+ mobile, PDF embeds, and any DOM-opaque UI, midscene's pixel grounding is
24
+ the only path. The integration plan below lets users pick the right tool
25
+ per intent without committing to one stack.
26
+
27
+ ## Midscene cache mechanics (verbatim)
28
+
29
+ From <https://midscenejs.com/caching>:
30
+
31
+ > "Midscene uses the prompt instruction as the cache key to store the
32
+ > execution plan returned by AI"
33
+ >
34
+ > "the system uses the location prompt as the cache key to store element
35
+ > XPath information"
36
+ >
37
+ > "Cache contents will be saved in the `./midscene_run/cache` directory with
38
+ > the `.cache.yaml` as the extension name"
39
+ >
40
+ > Invalidation triggers:
41
+ > 1. "The text content of the new element at the same XPath is different
42
+ > from the cached element"
43
+ > 2. "The DOM structure of the page is changed from the cached one"
44
+ >
45
+ > "query results like aiBoolean, aiQuery, aiAssert will never be cached"
46
+ >
47
+ > "XPath caching explicitly excludes Canvas, cross-origin iframes, closed
48
+ > Shadow DOM, and dynamically generated graphics"
49
+
50
+ How this maps onto our `browser-do` cache: both map natural-language text to
51
+ a selector (ours keyed by intent, theirs by the literal prompt). Theirs invalidates on DOM diff; ours on Phase 13
52
+ fingerprint mismatch. They are **structurally equivalent caches with
53
+ different invalidation triggers** — see "Integration path 3" below for the
54
+ composition.
55
+
56
+ ## Midscene API surface (verbatim)
57
+
58
+ From <https://midscenejs.com/api>:
59
+
60
+ > **Auto Planning**: `agent.aiAct()` / `agent.ai()` — Midscene
61
+ > automatically decomposes tasks into steps via AI model planning, then
62
+ > executes them sequentially.
63
+ >
64
+ > **Instant Actions**: `agent.aiTap()`, `agent.aiHover()`,
65
+ > `agent.aiInput()`, `agent.aiScroll()`, `agent.aiPinch()`,
66
+ > `agent.aiLongPress()`, `agent.aiDoubleClick()`, `agent.aiRightClick()` —
67
+ > The model locates elements while actions are predefined.
68
+ >
69
+ > **Data Extraction**: `agent.aiQuery()`, `agent.aiAsk()`,
70
+ > `agent.aiBoolean()`, `agent.aiNumber()`, `agent.aiString()`
71
+ >
72
+ > **Assertions & Waiting**: `agent.aiAssert()`, `agent.aiWaitFor()`,
73
+ > `agent.aiLocate()`
74
+
75
+ Screenshots are sent as base64 in the OpenAI-compatible payload.
76
+ `screenshotShrinkFactor` scales image dimensions before transmission.
77
+
78
+ ## Midscene MCP servers (verbatim)
79
+
80
+ From <https://midscenejs.com/mcp>: four MCP-server packages, one per
81
+ platform:
82
+
83
+ ```
84
+ npx -y @midscene/web-bridge-mcp
85
+ npx -y @midscene/ios-mcp
86
+ npx -y @midscene/android-mcp
87
+ npx -y @midscene/computer-mcp
88
+ ```
89
+
90
+ Tool surface (per category): `web_connect`, `ios_connect`,
91
+ `android_connect`, `computer_connect`, `take_screenshot`, `assert`, plus
92
+ the Action Space verbs (`Tap`, `Scroll`, etc.). Detailed input schemas are not
93
+ published on the docs page.
94
+
95
+ ## Model wiring — concrete env-var blocks
96
+
97
+ Midscene 2026 conventions (from
98
+ <https://midscenejs.com/model-common-config>):
99
+
100
+ ```bash
101
+ # Qwen3-VL via Alibaba DashScope (cloud, verified by midscene docs)
102
+ export MIDSCENE_MODEL_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
103
+ export MIDSCENE_MODEL_API_KEY="<your-dashscope-key>"
104
+ export MIDSCENE_MODEL_NAME="qwen3-vl-plus"
105
+ export MIDSCENE_MODEL_FAMILY="qwen3-vl"
106
+
107
+ # UI-TARS via Volcano Engine (cloud, verified)
108
+ export MIDSCENE_MODEL_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
109
+ export MIDSCENE_MODEL_API_KEY="<your-volces-key>"
110
+ export MIDSCENE_MODEL_NAME="ep-2025..."
111
+ export MIDSCENE_MODEL_FAMILY="vlm-ui-tars-doubao-1.5"
112
+
113
+ # GPT-5.4 (cloud, verified)
114
+ export MIDSCENE_MODEL_BASE_URL="https://api.openai.com/v1"
115
+ export MIDSCENE_MODEL_API_KEY="sk-..."
116
+ export MIDSCENE_MODEL_NAME="gpt-5.4"
117
+ export MIDSCENE_MODEL_FAMILY="gpt-5"
118
+ ```
119
+
120
+ ### Qwen3-VL via llama.cpp local (NOT in midscene docs — composed from OpenAI-compatible standard)
121
+
122
+
123
+ `llama-server` exposes an OpenAI-compatible API at
124
+ `http://127.0.0.1:8080/v1/chat/completions`. The `MIDSCENE_MODEL_*` block
125
+ follows:
126
+
127
+ ```bash
128
+ # Local llama.cpp serving Qwen3-VL-4B-Instruct (q4_K_M)
129
+ export MIDSCENE_MODEL_BASE_URL="http://127.0.0.1:8080/v1"
130
+ export MIDSCENE_MODEL_API_KEY="local" # llama-server ignores the key
131
+ export MIDSCENE_MODEL_NAME="Qwen3-VL-4B-Instruct"
132
+ export MIDSCENE_MODEL_FAMILY="qwen3-vl"
133
+ ```
134
+
135
+ Launch command (verified working in the acceptance run — see
136
+ "Acceptance: local-stack smoke test" below):
137
+
138
+ ```bash
139
+ llama-server -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M \
140
+ --host 127.0.0.1 --port 8080
141
+ ```
142
+
143
+ The `-hf` flag auto-downloads both the main `.gguf` AND the matching
144
+ `mmproj-*.gguf` (vision projector). First-run download ≈ 3.5 GB to
145
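+ `~/Library/Caches/llama.cpp/` (macOS) or `~/.cache/llama.cpp/` (Linux); the acceptance run below instead found the files under `~/.cache/huggingface/hub/` (2.8 GB on disk), so check both locations.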
146
+
147
+ ### Local-stack disk + memory budget
148
+
149
+ | Model | Disk (q4_K_M) | Runtime unified-memory | Suitable Apple Silicon |
150
+ |---|---|---|---|
151
+ | Qwen3-VL-2B | ~2 GB | ~3 GB | M1+ 8 GB |
152
+ | Qwen3-VL-4B | ~3.5 GB | ~5 GB | M1+ 16 GB |
153
+ | Qwen3-VL-8B | ~6.5 GB | ~8 GB | M2+ 16 GB |
154
+ | Qwen3-VL-30B (MoE A3B) | ~22 GB | ~24 GB | M3 Max / M4 Max / M3 Ultra |
155
+ | UI-TARS-1.5-7B | ~4 GB | ~6 GB | M2+ 16 GB (browser-task-tuned) |
156
+
157
+ ## Integration paths into browser-automation-skill
158
+
159
+ ### Path 1 — Add `midscene-bridge` as 5th adapter (NARROW SCOPE)
160
+
161
+ Routed only when the `--vision-only` flag is set OR when other adapters return
162
+ `EXIT_TOOL_UNSUPPORTED_OP` because target is in a canvas / mobile-only
163
+ UI. Implementation: thin bash wrapper around `npx @midscene/web-bridge-mcp`
164
+ that translates our verb argv → midscene MCP `tools/call`.
165
+
166
+ Triggers (per `references/adapter-candidates.md` template):
167
+ - Target element is inside `<canvas>` → no other adapter can grab it
168
+ - User passes `--vision-only` explicitly
169
+ - User has Qwen3-VL/UI-TARS running locally OR cloud creds set
170
+
171
+ Why not as default: vision adapter costs ~1000 tok/action vs our cached
172
+ 0 tok. Only use when the cheaper paths don't apply.
173
+
174
+ ### Path 2 — Local-model env-var passthrough in our MCP server
175
+
176
+ Our Stage-1 MCP server (`scripts/lib/node/mcp-server.mjs`, shipped in
177
+ `67fd4a1`) currently shells to bash verb scripts. Extend it so the
178
+ `MIDSCENE_MODEL_*` block — when set in the client's env — passes through
179
+ to spawned children. Then clients that also speak to midscene see one
180
+ consistent local-LLM endpoint regardless of which skill they're calling.
181
+
182
+ Effort: ~5 LOC in `mcp-server.mjs`'s `spawn` call. No new dependency.
183
+
184
+ ### Path 3 — `browser-do` cache enrichment via local VLM (HIGH LEVERAGE)
185
+
186
+ Our intent-keyed cache (`scripts/browser-do.sh`) currently does:
187
+
188
+ ```
189
+ intent → archetype-id → cached selector → click
190
+ ↓ (miss / stale)
191
+ LLM round-trip on host (Claude)
192
+ ```
193
+
194
+ Proposal: insert a local VLM probe BEFORE the LLM fallback:
195
+
196
+ ```
197
+ intent → archetype-id → cached selector
198
+ ↓ (Phase 13 fingerprint diff)
199
+ local Qwen3-VL "is element at [bbox] still <intent>?"
200
+ ↓ yes ↓ no
201
+ keep cache LLM round-trip on host
202
+ ```
203
+
204
+ Effect: when a cached selector goes stale due to a cosmetic DOM diff
205
+ (common: tooltip wrapper changes, ARIA attribute added), the local VLM
206
+ confirms visual identity in ~0.45 s on an M3 Pro w/ Qwen3-VL-4B (measured LEAN vision latency). Zero
207
+ cloud tokens. Cache survives.
208
+
209
+ When the page genuinely changed (button moved, new flow), local VLM says
210
+ no → fall through to existing LLM round-trip → normal Phase 13 rescue
211
+ applies.
212
+
213
+ This is the biggest token saver of the three because it intercepts the
214
+ hot path: cache-near-miss is the most common cache failure mode in real
215
+ telemetry.
216
+
217
+ ## What we shipped today (Phase 14) that enables this
218
+
219
+ Three commits landed before this design doc:
220
+
221
+ | Commit | Why this matters for midscene integration |
222
+ |---|---|
223
+ | `763c86c` Phase 14 A/B/C | `oblivious_success` events now fire on URL mismatch (Phase B). Means we can A/B "midscene-vision vs. cached selector" choices using stats.jsonl as the scoreboard, not anecdote. |
224
+ | `67fd4a1` MCP Stage 1 | Our verbs are now MCP-callable. Midscene's MCP server can call our `browser_snapshot` to get a cheap text ref before deciding whether vision grounding is needed. |
225
+ | `149a7d1` capture-flake fix | Full suite back to 1028/1028; future Path-3 work can ship behind RED-GREEN bats without an existing failure masking new regressions. |
226
+
227
+ ## Acceptance: local-stack smoke test (status: measured 2026-05-20)
228
+
229
+ Environment:
230
+
231
+ - Hardware: Apple M3 Pro, 36 GB unified memory, macOS 25.5 (Darwin)
232
+ - llama.cpp: brew bottle 9200 (`3e12fbdea`), ARM64 native (`/opt/homebrew/bin/llama-server`)
233
+ - Model: `Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M` (4.02B params, 175K ctx slot, 2.49 GB resident; auto-downloaded with mmproj to `~/.cache/huggingface/hub/models--Qwen--Qwen3-VL-4B-Instruct-GGUF/`, total 2.8 GB on disk)
234
+ - Endpoint: `http://127.0.0.1:8080/v1/chat/completions` (OpenAI-compatible)
235
+ - Launch (FAT — defaults; wasteful for single-user): `llama-server -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M --host 127.0.0.1 --port 8080`
236
+ - Launch (**LEAN, recommended** — single-user single-skill on M-series): see "Lean launch" block below
237
+
238
+ ### Two measured runs (same hardware, same model)
239
+
240
+ Each row = same prompt against the same model on the same Mac. FAT run
241
+ takes server defaults (parallel=4, ctx=175616, threads=all-P-cores,
242
+ cache-ram=8192 MiB). LEAN run uses the bounded flags from the next
243
+ section.
244
+
245
+ | Smoke | FAT lat | LEAN lat | Speedup | FAT prompt / pred tok/s | LEAN prompt / pred tok/s | LEAN completion |
246
+ |---|---:|---:|---:|---:|---:|---|
247
+ | **1. Text (cold)** | 5.16 s | **0.28 s** | **18×** | 5.55 / 1.05 | 74.55 / 51.17 | `"Hello"` ✓ |
248
+ | **2. Vision (red PNG)** | 15.09 s | **0.47 s** | **32×** | 2.55 / 4.56 | 75.97 / 58.17 | `"blue"` ✗ (still wrong — quant-bound) |
249
+ | **3. Vision (green PNG)** | 15.56 s | **0.43 s** | **36×** | 1.76 / 2.78 | 70.74 / 58.25 | `"Green"` ✓ |
250
+ | **4. Text (warm)** | 1.88 s | **0.25 s** | **7.5×** | 12.64 / 7.80 | 91.88 / 41.30 | `"No reply."` ✓ |
251
+
252
+ Resident RAM (peak `ps -o rss` on the llama-server child): LEAN **3.99 GB** measured. FAT not directly measured but allocates a much larger KV cache (175616 ctx × 4 slots vs 8192 × 1 slot ≈ 86× theoretical KV-buffer ratio); the prompt-cache cap dropped from 8192 MiB → 512 MiB independently.
253
+
254
+ ### Path-3 unblock: `browser-vlm bench`
255
+
256
+ The smoke battery above ran against ONE model. To answer "which model unblocks
257
+ Path 3?" empirically, use `bench`:
258
+
259
+ ```bash
260
+ bash scripts/browser-vlm.sh bench # default 3-model list
261
+ bash scripts/browser-vlm.sh bench --dry-run # print plan, no spawns
262
+ bash scripts/browser-vlm.sh bench \
263
+ Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0 \
264
+ Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M # custom list
265
+ ```
266
+
267
+ Each model: start → wait `/health` → 4-smoke battery → stop. Output is one
268
+ NDJSON line per model + a final `bench-done` aggregate. Path 3 is unblocked
269
+ when a model reports `pass:4 fail:0` on the smoke battery (both vision color
270
+ identifications correct).
271
+
272
+ Bench default list (covers the three best Path-3 candidates):
273
+
274
+ | Model | Disk (~) | Why |
275
+ |---|---|---|
276
+ | `Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M` | 3.5 GB | baseline (the one we measured red→"blue" miss) |
277
+ | `Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0` | 4.3 GB | same params, less quantization — fast test if quant was the issue |
278
+ | `Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M` | 6.5 GB | midscene's recommended default |
279
+
280
+ First-time `bench` runs incur the GGUF + mmproj download cost per model
281
+ (landed at `~/.cache/huggingface/hub/`); subsequent runs against the same
282
+ model start in ~5 s.
283
+
284
+ ### Lean launch (recommended default)
285
+
286
+ ```bash
287
+ llama-server -hf Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M \
288
+ --host 127.0.0.1 --port 8080 \
289
+ --ctx-size 8192 \
290
+ --parallel 1 \
291
+ --threads 4 \
292
+ --threads-batch 6 \
293
+ --cache-ram 512 \
294
+ --n-gpu-layers 99
295
+ ```
296
+
297
+ Why each flag:
298
+
299
+ | Flag | Default | Lean | Why |
300
+ |---|---|---|---|
301
+ | `--ctx-size` | 175616 | 8192 | KV cache scales linearly with context size; 8K is enough for any single browser-grounding prompt |
302
+ | `--parallel` | 4 | 1 | each slot reserves its own KV cache; single-user → single slot |
303
+ | `--threads` | all P-cores | 4 | bounds generation-thread CPU footprint; leaves UI/agent responsive |
304
+ | `--threads-batch` | = `--threads` | 6 | lets prompt-eval (compute-bound) use more cores than generation (memory-bound) |
305
+ | `--cache-ram` | 8192 (MiB) | 512 | cap cross-request prompt cache; 512 MiB is enough for a few repeated turns |
306
+ | `--n-gpu-layers` | 99 (macOS default) | 99 | explicit; ensures all transformer layers offload to Metal GPU |
307
+
308
+ **Implications for the integration paths above:**
309
+
310
+ - **Pipeline works end-to-end.** OpenAI-compatible chat completions ✓, image_url base64 ingestion ✓, mmproj auto-loaded ✓.
311
+ - **Vision accuracy at 4B q4_K_M is borderline** (1/2 primary-color identifications wrong on identical-protocol calls — same outcome in both FAT and LEAN runs, so the misclassification is the QUANTIZATION talking, not config). For **Path 3 (cache-rescue visual confirmation)** this would generate false-negatives → DON'T wire 4B-q4_K_M into the cache hot path. Either:
312
+ - **Qwen3-VL-8B q4_K_M** (~6.5 GB) — recommended by midscene
313
+ - **Qwen3-VL-4B q8_0** (~4.3 GB) — higher fidelity at smaller size
314
+ - **UI-TARS-1.5-7B** (~4 GB) — explicitly post-trained on UI grounding
315
+ - **LEAN config makes the local stack viable.** FAT vision-call cost was ~15 s; LEAN is ~0.45 s. That changes the Path 3 cost frame: **a cache-rescue visual probe at <500 ms is now competitive with — and often cheaper than — a cloud LLM round-trip** (~1 s + token cost). The model-accuracy gap above is now the only blocker; bump the model and Path 3 becomes the highest-ROI integration.
316
+ - **`Failed to load image or audio file` error** appears if the `data:image/png;base64,…` URL is malformed (e.g. embedded newlines in base64) — strip newlines with `tr -d '\n'` before constructing the data URL.
317
+
318
+ ### Caveat — the FAT-vs-LEAN speedup is partially mmap-warmth, not pure config
319
+
320
+ The LEAN run executed after the FAT run on the same machine. Model
321
+ weights were already in the macOS filesystem cache (mmap'd from disk),
322
+ so LEAN paid no I/O-warmup cost. A true cold-disk LEAN run would be
323
+ slower than 0.28 s on Smoke 1 — closer to 1–2 s for the initial weight
324
+ read. Subsequent calls within the same process would still hit the
325
+ numbers in the LEAN column. The win that IS purely config:
326
+ - single slot vs four = no KV-cache duplication
327
+ - smaller ctx-size = smaller per-token attention matmul
328
+ - bounded thread count = no thermal throttling on M3 Pro's perf cores
329
+
330
+ ## What NOT to do
331
+
332
+ - **Don't replace our a11y backbone with vision.** Token math fails for
333
+ DOM apps (see table above).
334
+ - **Don't bundle Qwen3-VL into `install.sh`.** Even 4B q4_K_M is a 3.5 GB
335
+ download. `doctor` should advisory-check `llama-server --version` +
336
+ HTTP ping `/health` and report "ok: local VLM ready" or "warn: local
337
+ VLM not running — see references/midscene-integration.md".
338
+ - **Don't call midscene MCP server inline from a bash verb.** It's a
339
+ long-lived daemon. Treat it like our Phase-5 `daemon-start` model:
340
+ user launches once, verbs reuse the loopback endpoint.
341
+ - **Don't auto-fall-through to vision on every cache miss.** Gate behind
342
+ an env var (e.g. `BROWSER_SKILL_VISION_FALLBACK=1`) until we have
343
+ stats showing the VLM probe actually saves cloud tokens net of its
344
+ local-compute cost.
345
+
346
+ ## References
347
+
348
+ - midscene introduction — <https://midscenejs.com/introduction>
349
+ - midscene caching — <https://midscenejs.com/caching>
350
+ - midscene API — <https://midscenejs.com/api>
351
+ - midscene MCP — <https://midscenejs.com/mcp>
352
+ - midscene model config — <https://midscenejs.com/model-common-config>
353
+ - llama.cpp multimodal — <https://github.com/ggml-org/llama.cpp/blob/master/docs/multimodal.md>
354
+ - Qwen3-VL-4B-Instruct GGUF — <https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct-GGUF>
355
+ - Our cache (browser-do) — `scripts/browser-do.sh` + spec
356
+ `docs/superpowers/specs/2026-05-08-phase-11-memory-design.md`
357
+ - Our MCP server — `scripts/lib/node/mcp-server.mjs` + cheatsheet
358
+ `references/browser-mcp-cheatsheet.md`
359
+ - Phase 14 commits — `763c86c`, `67fd4a1`, `149a7d1`
@@ -0,0 +1,103 @@
1
+ # obscura — cheatsheet
2
+
3
+ The browser-skill's obscura adapter shells to the [`obscura`](https://github.com/h4ckf0r0day/obscura) binary (Apache-2.0 Rust headless browser, ~70 MB). Obscura is the default-routed adapter for `--scrape` and `--stealth` (since Phase 8 part 2-i); reachable as `--tool obscura` for explicit override.
4
+
5
+ ## When the router picks this adapter
6
+
7
+ | Verb | Default? |
8
+ |---|---|
9
+ | `extract --scrape <urls...>` | **yes** (since 8-2-i) — `rule_scrape_flag` in `scripts/lib/router.sh` |
10
+ | `extract --stealth <url>` | **yes** (since 8-2-i) — `rule_stealth_flag` in `scripts/lib/router.sh` |
11
+ | any other verb | no — obscura is a one-shot fetch/scrape adapter |
12
+
13
+ `--tool obscura` still works as an explicit override. `--tool chrome-devtools-mcp --scrape` would fail the capability filter (cdt-mcp doesn't declare `--scrape`) and the router falls through to the next rule.
14
+
15
+ ## Capabilities declared
16
+
17
+ ```json
18
+ {
19
+ "verbs": {
20
+ "extract": { "flags": ["--scrape", "--stealth", "--eval", "--selector"] }
21
+ }
22
+ }
23
+ ```
24
+
25
+ Stateful verbs (`open`, `click`, `fill`, `snapshot`) are intentionally **not** declared. Obscura is a one-shot fetch/scrape engine; stateful navigation belongs to `playwright-cli` / `playwright-lib` / `chrome-devtools-mcp`.
26
+
27
+ ## Doctor check
28
+
29
+ The adapter checks for the `obscura` binary on PATH. To install:
30
+
31
+ ```bash
32
+ # Linux x86_64
33
+ curl -LO https://github.com/h4ckf0r0day/obscura/releases/latest/download/obscura-x86_64-linux.tar.gz
34
+ tar xzf obscura-x86_64-linux.tar.gz
35
+
36
+ # macOS Apple Silicon
37
+ curl -LO https://github.com/h4ckf0r0day/obscura/releases/latest/download/obscura-aarch64-macos.tar.gz
38
+ tar xzf obscura-aarch64-macos.tar.gz
39
+ ```
40
+
41
+ No Chrome, no Node.js, no dependencies. Release archives include both `obscura` and `obscura-worker`; keep them in the same directory for the parallel `scrape` command.
42
+
43
+ ## Version pin
44
+
45
+ - `version_pin: "0.x"` — pre-1.0 upstream; major.minor stability target tracks the latest release available at adapter-roll time.
46
+
47
+ ## Override
48
+
49
+ To force this adapter even when the router would pick another (e.g. for verbs where obscura isn't the default):
50
+
51
+ ```bash
52
+ bash scripts/browser-extract.sh --tool obscura ...
53
+ ```
54
+
55
+ This was the **Path A entry point** in 8-1-i / 8-1-ii / 8-1-iii. After 8-2-i, `--scrape` and `--stealth` auto-route to obscura (Path B); the explicit `--tool obscura` is no longer needed for those flags but still works as an override.
56
+
57
+ ## Modes
58
+
59
+ Obscura ships in two modes upstream. The adapter targets only the first (one-shot) mode.
60
+
61
+ | Mode | Surface | Handled by |
62
+ |---|---|---|
63
+ | **One-shot** | `obscura fetch <url>` + `obscura scrape <urls...>` | this adapter (8-1-ii / 8-1-iii) |
64
+ | **CDP server** | `obscura serve --port 9222` (Puppeteer/Playwright connect via `connectOverCDP`) | future `playwright-lib --cdp-endpoint` flag — NOT this adapter |
65
+
66
+ Reasoning: the CDP-server mode overlaps with `playwright-lib`'s transport. Adding it as a separate adapter would split the contributor mental model. The unique-lane principle picks the one-shot mode only.
67
+
68
+ ## Limitations
69
+
70
+ - **No stateful navigation** — no persistent page; each invocation is a fresh process. Use `playwright-cli` / `playwright-lib` / `chrome-devtools-mcp` for click/fill/snapshot flows.
71
+ - **No console-message or network-HAR capture** — use `chrome-devtools-mcp` (`--capture-console` / `--capture-network`).
72
+ - **No lighthouse audit** — use `chrome-devtools-mcp`.
73
+ - **No `--firefox` / `--webkit`** — Chromium-CDP only; multi-browser is `playwright-cli`'s lane.
74
+
75
+ ## Stealth mode
76
+
77
+ Obscura's standout feature: build with `--features stealth`, run with `--stealth`. Includes:
78
+ - Per-session fingerprint randomization (GPU, screen, canvas, audio, battery)
79
+ - Realistic `navigator.userAgentData` (Chrome high-entropy values)
80
+ - `navigator.webdriver = undefined`
81
+ - 3,520-domain tracker block
82
+
83
+ Real mode is available in this adapter via `extract --stealth <url> --eval EXPR` (since 8-1-iii). Single URL only; `--eval` required (without it `obscura fetch` dumps full HTML, too large for the streaming-event contract). Emits one `extract_stealth` event with `{event, url, eval}`. The `eval` field is currently always a string — typed parsing is deferred. Callers needing typed results should `JSON.stringify` inside their `--eval` expression and parse downstream.
84
+
85
+ ```bash
86
+ # String eval (default):
87
+ bash scripts/browser-extract.sh --tool obscura --stealth \
88
+ --eval "document.title" https://example.com
89
+
90
+ # Typed eval (caller-encoded JSON):
91
+ bash scripts/browser-extract.sh --tool obscura --stealth \
92
+ --eval "JSON.stringify({title: document.title, h1: document.querySelector('h1').textContent})" \
93
+ https://example.com
94
+ ```
95
+
96
+ `--scrape` and `--stealth` are mutually exclusive (verb script enforces this).
97
+
98
+ ## See also
99
+
100
+ - [Tool adapter extension model spec](../docs/superpowers/specs/2026-04-30-tool-adapter-extension-model-design.md)
101
+ - [Routing heuristics](routing-heuristics.md)
102
+ - [Tool versions](tool-versions.md)
103
+ - [Adapter candidates](adapter-candidates.md) — other tools considered + declined
@@ -0,0 +1,64 @@
1
+ # playwright-cli — cheatsheet
2
+
3
+ The browser-skill's playwright-cli adapter shells to the `playwright` binary (Microsoft's Playwright CLI). This adapter is the **default** for navigation and inspection verbs; it is the cheap, multi-browser, low-task-token-cost path.
4
+
5
+ ## When the router picks this adapter
6
+
7
+ | Verb | Default? |
8
+ |---|---|
9
+ | `open` | yes |
10
+ | `click` (and `dblclick`) | yes |
11
+ | `fill` (and `type`) | yes |
12
+ | `snapshot` | yes |
13
+ | `inspect` (without `--capture-*`) | yes |
14
+ | `audit` | no — routed to chrome-devtools-mcp |
15
+ | `extract --scrape` | no — routed to obscura |
16
+
17
+ ## Capabilities declared
18
+
19
+ ```json
20
+ {
21
+ "verbs": {
22
+ "open": { "flags": ["--headed", "--viewport", "--user-agent"] },
23
+ "click": { "flags": ["--ref", "--selector"] },
24
+ "fill": { "flags": ["--ref", "--text", "--secret-stdin"] },
25
+ "snapshot": { "flags": [] },
26
+ "inspect": { "flags": ["--selector"] }
27
+ }
28
+ }
29
+ ```
30
+
31
+ ## Doctor check
32
+
33
+ The adapter checks for the `playwright` binary on PATH. To install:
34
+
35
+ ```bash
36
+ npm i -g playwright @playwright/test
37
+ playwright install chromium
38
+ ```
39
+
40
+ ## Version pin
41
+
42
+ - `version_pin: "1.49.x"` — major.minor stability target. Fixture argv-hashes assume the surface CLI shape of 1.49.
43
+
44
+ ## Override
45
+
46
+ To force this adapter even when the router would pick another:
47
+
48
+ ```bash
49
+ bash scripts/browser-<verb>.sh --tool=playwright-cli ...
50
+ ```
51
+
52
+ This is the **Path A entry point** for any new verb that isn't yet in the router's precedence table — it works without router edits.
53
+
54
+ ## Limitations
55
+
56
+ - No console-message or network-HAR capture (use `--tool=chrome-devtools-mcp` or omit `--tool` and pass `--capture-console` / `--capture-network`).
57
+ - No lighthouse audit (chrome-devtools-mcp).
58
+ - No stealth / anti-fingerprinting (obscura).
59
+
60
+ ## See also
61
+
62
+ - [Tool adapter extension model spec](../docs/superpowers/specs/2026-04-30-tool-adapter-extension-model-design.md)
63
+ - [Routing heuristics](routing-heuristics.md)
64
+ - [Tool versions](tool-versions.md)
@@ -0,0 +1,90 @@
1
+ # playwright-lib — cheatsheet
2
+
3
+ The browser-skill's playwright-lib adapter shells out to a Node ESM driver
4
+ (`scripts/lib/node/playwright-driver.mjs`) that talks to the real Playwright API
5
+ directly. This adapter is the **session-aware** path: it natively supports
6
+ `storageState` loading, `--secret-stdin`, and any future capability that needs
7
+ in-process Node access.
8
+
9
+ ## When the router picks this adapter
10
+
11
+ The default router prefers `playwright-cli` for stateless navigation/inspection
12
+ verbs (cheaper cold start, no node dep on the hot path). `playwright-lib` wins
13
+ when:
14
+
15
+ - `BROWSER_SKILL_STORAGE_STATE` is set (typically because the verb script
16
+ resolved `--site` / `--as` to a stored session) — `rule_session_required`
17
+ picks playwright-lib.
18
+ - `--secret-stdin` is used with the `fill` verb — playwright-cli rejects this
19
+ flag (the secret would leak via argv).
20
+ - The verb is `login` — there is no playwright-cli `login` subcommand.
21
+
22
+ You can always force this adapter via `--tool=playwright-lib`.
23
+
24
+ ## Capabilities declared
25
+
26
+ ```json
27
+ {
28
+ "verbs": {
29
+ "open": { "flags": ["--headed", "--viewport", "--user-agent", "--storage-state"] },
30
+ "click": { "flags": ["--ref", "--selector"] },
31
+ "fill": { "flags": ["--ref", "--text", "--secret-stdin"] },
32
+ "snapshot": { "flags": ["--depth"] },
33
+ "login": { "flags": ["--storage-state"] }
34
+ },
35
+ "session_load": true
36
+ }
37
+ ```
38
+
39
+ Note: `inspect` / `audit` / `extract` are intentionally NOT declared — those
40
+ are chrome-devtools-mcp / obscura territory. The verb-dispatch functions for
41
+ those exist but return `EXIT_TOOL_UNSUPPORTED_OP` (41) so the router never
42
+ picks playwright-lib for them.
43
+
44
+ ## Doctor check
45
+
46
+ Verifies `node` is on PATH and the driver script is present at the expected
47
+ path. To install Node + Playwright + a browser:
48
+
49
+ ```bash
50
+ brew install node # macOS
51
+ npm i -g playwright @playwright/test # lib + test runner
52
+ playwright install chromium # browser binary
53
+ ```
54
+
55
+ ## Version pin
56
+
57
+ - `version_pin: "1.59.x"` — pinned to a major.minor stability target. The
58
+ driver's lazy `import('playwright')` reads whatever version is installed;
59
+ drift between pin and installed version surfaces in `tool_doctor_check`.
60
+
61
+ ## Stub mode
62
+
63
+ Set `BROWSER_SKILL_LIB_STUB=1` to run the driver against
64
+ `tests/fixtures/playwright-lib/<argv-hash>.json` instead of launching a
65
+ browser. Used by the bats suite + CI so tests don't require Playwright
66
+ to be installed.
67
+
68
+ ## Storage state (session loading)
69
+
70
+ The driver accepts `--storage-state PATH` to apply a Playwright `storageState`
71
+ JSON before navigation. Verb scripts forward `BROWSER_SKILL_STORAGE_STATE`
72
+ (set by `resolve_session_storage_state` in `verb_helpers.sh`) as that flag.
73
+ Origin enforcement happens in the Phase-2 `session_origin_check` lib before
74
+ the env var is exported, so the storageState's origins are guaranteed to
75
+ match the site URL when the driver receives it.
76
+
77
+ ## Secrets
78
+
79
+ `tool_fill --ref eN --secret-stdin` reads the secret from process stdin
80
+ inside the driver — secrets never reach argv (avoiding anti-pattern AP-7). The
81
+ playwright-cli adapter rejects `--secret-stdin` for this reason; route to
82
+ playwright-lib explicitly via `--tool=playwright-lib` or rely on the router's
83
+ preference when `--site/--as` resolves a session.
84
+
85
+ ## See also
86
+
87
+ - [Tool adapter extension model spec](../docs/superpowers/specs/2026-04-30-tool-adapter-extension-model-design.md)
88
+ - [Token-efficient adapter output spec](../docs/superpowers/specs/2026-05-01-token-efficient-adapter-output-design.md)
89
+ - [playwright-cli cheatsheet](playwright-cli-cheatsheet.md) — the binary-shelled sibling adapter.
90
+ - [Tool versions](tool-versions.md)