browser-automation-skill 0.71.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +144 -0
  3. package/SECURITY.md +39 -0
  4. package/SKILL.md +206 -0
  5. package/bin/cli.mjs +55 -0
  6. package/install.sh +143 -0
  7. package/package.json +54 -0
  8. package/references/adapter-candidates.md +40 -0
  9. package/references/browser-mcp-cheatsheet.md +132 -0
  10. package/references/browser-stats-cheatsheet.md +155 -0
  11. package/references/chrome-devtools-mcp-cheatsheet.md +232 -0
  12. package/references/midscene-integration.md +359 -0
  13. package/references/obscura-cheatsheet.md +103 -0
  14. package/references/playwright-cli-cheatsheet.md +64 -0
  15. package/references/playwright-lib-cheatsheet.md +90 -0
  16. package/references/recipes/add-a-tool-adapter.md +134 -0
  17. package/references/recipes/agent-workflows/README.md +37 -0
  18. package/references/recipes/agent-workflows/cache-driven-bulk-operation.md +110 -0
  19. package/references/recipes/agent-workflows/flow-record-and-replay.md +102 -0
  20. package/references/recipes/agent-workflows/incremental-pattern-discovery.md +125 -0
  21. package/references/recipes/agent-workflows/login-then-scrape.md +100 -0
  22. package/references/recipes/anti-patterns-tool-extension.md +182 -0
  23. package/references/recipes/body-bytes-not-body.md +139 -0
  24. package/references/recipes/cache-write-security.md +210 -0
  25. package/references/recipes/fingerprint-rescue.md +154 -0
  26. package/references/recipes/model-routing.md +143 -0
  27. package/references/recipes/path-security.md +138 -0
  28. package/references/recipes/privacy-canary.md +96 -0
  29. package/references/recipes/visual-rescue-hook.md +182 -0
  30. package/references/stats-prices.json +42 -0
  31. package/references/stats-schema.json +77 -0
  32. package/references/tool-versions.md +8 -0
  33. package/scripts/browser-add-site.sh +113 -0
  34. package/scripts/browser-assert.sh +106 -0
  35. package/scripts/browser-audit.sh +68 -0
  36. package/scripts/browser-baseline.sh +135 -0
  37. package/scripts/browser-click.sh +100 -0
  38. package/scripts/browser-creds-add.sh +254 -0
  39. package/scripts/browser-creds-list.sh +67 -0
  40. package/scripts/browser-creds-migrate.sh +122 -0
  41. package/scripts/browser-creds-remove.sh +69 -0
  42. package/scripts/browser-creds-rotate-totp.sh +109 -0
  43. package/scripts/browser-creds-show.sh +82 -0
  44. package/scripts/browser-creds-totp.sh +94 -0
  45. package/scripts/browser-do.sh +630 -0
  46. package/scripts/browser-doctor.sh +365 -0
  47. package/scripts/browser-drag.sh +90 -0
  48. package/scripts/browser-extract.sh +192 -0
  49. package/scripts/browser-fill.sh +142 -0
  50. package/scripts/browser-flow.sh +316 -0
  51. package/scripts/browser-history.sh +187 -0
  52. package/scripts/browser-hover.sh +92 -0
  53. package/scripts/browser-inspect.sh +188 -0
  54. package/scripts/browser-list-sessions.sh +78 -0
  55. package/scripts/browser-list-sites.sh +42 -0
  56. package/scripts/browser-login.sh +279 -0
  57. package/scripts/browser-mcp.sh +65 -0
  58. package/scripts/browser-migrate.sh +195 -0
  59. package/scripts/browser-open.sh +134 -0
  60. package/scripts/browser-press.sh +80 -0
  61. package/scripts/browser-remove-session.sh +72 -0
  62. package/scripts/browser-remove-site.sh +68 -0
  63. package/scripts/browser-replay.sh +206 -0
  64. package/scripts/browser-route.sh +174 -0
  65. package/scripts/browser-select.sh +122 -0
  66. package/scripts/browser-show-session.sh +57 -0
  67. package/scripts/browser-show-site.sh +37 -0
  68. package/scripts/browser-snapshot.sh +176 -0
  69. package/scripts/browser-stats.sh +522 -0
  70. package/scripts/browser-tab-close.sh +112 -0
  71. package/scripts/browser-tab-list.sh +70 -0
  72. package/scripts/browser-tab-switch.sh +111 -0
  73. package/scripts/browser-upload.sh +132 -0
  74. package/scripts/browser-use.sh +60 -0
  75. package/scripts/browser-vlm.sh +707 -0
  76. package/scripts/browser-wait.sh +97 -0
  77. package/scripts/install-git-hooks.sh +16 -0
  78. package/scripts/lib/capture.sh +356 -0
  79. package/scripts/lib/common.sh +262 -0
  80. package/scripts/lib/credential.sh +237 -0
  81. package/scripts/lib/fingerprint-rescue.js +123 -0
  82. package/scripts/lib/flow.sh +448 -0
  83. package/scripts/lib/flow_record.sh +210 -0
  84. package/scripts/lib/mask.sh +49 -0
  85. package/scripts/lib/memory.sh +427 -0
  86. package/scripts/lib/migrate.sh +390 -0
  87. package/scripts/lib/migrators/README.md +23 -0
  88. package/scripts/lib/migrators/memory/v1_to_v2.sh +15 -0
  89. package/scripts/lib/migrators/recent_urls/README.md +13 -0
  90. package/scripts/lib/migrators/stats/README.md +24 -0
  91. package/scripts/lib/node/chrome-devtools-bridge.mjs +1812 -0
  92. package/scripts/lib/node/mcp-server.mjs +531 -0
  93. package/scripts/lib/node/mcp-tools.json +68 -0
  94. package/scripts/lib/node/playwright-driver.mjs +1104 -0
  95. package/scripts/lib/node/totp-core.mjs +52 -0
  96. package/scripts/lib/node/totp.mjs +52 -0
  97. package/scripts/lib/node/url-pattern-cluster.mjs +102 -0
  98. package/scripts/lib/node/url-pattern-resolver.mjs +77 -0
  99. package/scripts/lib/output.sh +79 -0
  100. package/scripts/lib/router.sh +342 -0
  101. package/scripts/lib/sanitize.sh +107 -0
  102. package/scripts/lib/secret/keychain.sh +91 -0
  103. package/scripts/lib/secret/libsecret.sh +74 -0
  104. package/scripts/lib/secret/plaintext.sh +75 -0
  105. package/scripts/lib/secret_backend_select.sh +57 -0
  106. package/scripts/lib/session.sh +153 -0
  107. package/scripts/lib/site.sh +126 -0
  108. package/scripts/lib/stats.sh +419 -0
  109. package/scripts/lib/tool/.gitkeep +0 -0
  110. package/scripts/lib/tool/chrome-devtools-mcp.sh +349 -0
  111. package/scripts/lib/tool/obscura.sh +249 -0
  112. package/scripts/lib/tool/playwright-cli.sh +155 -0
  113. package/scripts/lib/tool/playwright-lib.sh +106 -0
  114. package/scripts/lib/verb_helpers.sh +222 -0
  115. package/scripts/lib/visual-rescue-default.sh +145 -0
  116. package/scripts/regenerate-docs.sh +99 -0
  117. package/uninstall.sh +51 -0
@@ -0,0 +1,707 @@
1
+ #!/usr/bin/env bash
2
+ # scripts/browser-vlm.sh — wrap llama-server for local-VLM use w/ the
3
+ # session-validated lean config.
4
+ #
5
+ # Usage:
6
+ # bash scripts/browser-vlm.sh start [--dry-run]
7
+ # bash scripts/browser-vlm.sh stop
8
+ # bash scripts/browser-vlm.sh status
9
+ # bash scripts/browser-vlm.sh smoke
10
+ # bash scripts/browser-vlm.sh --help
11
+ #
12
+ # Phase 14: smoke runs during the midscene integration showed that the FAT
13
+ # default config (parallel=4, ctx=175616, threads=all-P-cores) was 18-36x
14
+ # slower than the lean config below on the same hardware. This helper
15
+ # bakes the lean numbers in so users don't retype 6 flags every session.
16
+ #
17
+ # State (mode 0600 inside the mode-0700 BROWSER_SKILL_HOME):
18
+ # ~/.browser-skill/vlm.pid PID of running llama-server
19
+ # ~/.browser-skill/vlm.log stdout+stderr from llama-server
20
+ #
21
+ # Env overrides (defaults validated 2026-05-20 on M3 Pro / 36 GB):
22
+ # BROWSER_SKILL_VLM_MODEL Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
23
+ # BROWSER_SKILL_VLM_HOST 127.0.0.1
24
+ # BROWSER_SKILL_VLM_PORT 8080
25
+ # BROWSER_SKILL_VLM_CTX_SIZE 8192
26
+ # BROWSER_SKILL_VLM_PARALLEL 1
27
+ # BROWSER_SKILL_VLM_THREADS 4
28
+ # BROWSER_SKILL_VLM_THREADS_BATCH 6
29
+ # BROWSER_SKILL_VLM_CACHE_RAM_MB 512
30
+ # BROWSER_SKILL_VLM_NGL 99 (Metal layers; macOS default)
31
+ # BROWSER_SKILL_NODE_BIN (unused here; reserved for parity w/ browser-mcp)
32
+ # LLAMA_SERVER_BIN llama-server
33
+
34
+ set -euo pipefail
35
+ IFS=$'\n\t'
36
+
37
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
38
+ # shellcheck source=lib/common.sh
39
+ # shellcheck disable=SC1091
40
+ source "${SCRIPT_DIR}/lib/common.sh"
41
+
42
# --- Tunables: each value may be overridden through the environment --------

# llama-server binary and its launch parameters (lean defaults).
LLAMA_SERVER_BIN=${LLAMA_SERVER_BIN:-llama-server}
VLM_MODEL=${BROWSER_SKILL_VLM_MODEL:-Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M}
VLM_HOST=${BROWSER_SKILL_VLM_HOST:-127.0.0.1}
VLM_PORT=${BROWSER_SKILL_VLM_PORT:-8080}
VLM_CTX=${BROWSER_SKILL_VLM_CTX_SIZE:-8192}
VLM_PARALLEL=${BROWSER_SKILL_VLM_PARALLEL:-1}
VLM_THREADS=${BROWSER_SKILL_VLM_THREADS:-4}
VLM_THREADS_BATCH=${BROWSER_SKILL_VLM_THREADS_BATCH:-6}
VLM_CACHE_RAM=${BROWSER_SKILL_VLM_CACHE_RAM_MB:-512}
VLM_NGL=${BROWSER_SKILL_VLM_NGL:-99}

# State directory and the files this script keeps inside it.
BROWSER_SKILL_HOME=${BROWSER_SKILL_HOME:-${HOME}/.browser-skill}
VLM_PID_FILE=${BROWSER_SKILL_HOME}/vlm.pid
VLM_LOG_FILE=${BROWSER_SKILL_HOME}/vlm.log
VLM_WATCHDOG_PID_FILE=${BROWSER_SKILL_HOME}/vlm-watchdog.pid
VLM_LAST_USED_FILE=${BROWSER_SKILL_HOME}/vlm.last-used

# Idle-stop watchdog timings, in seconds.
VLM_IDLE_TIMEOUT_S=${BROWSER_SKILL_VLM_IDLE_TIMEOUT:-600}   # 10 min default
VLM_WATCHDOG_POLL_S=${BROWSER_SKILL_VLM_WATCHDOG_POLL:-60}  # 1 min default
60
+
61
# _ensure_home
# Make sure the state directory ${BROWSER_SKILL_HOME} exists. Dies with
# EXIT_GENERIC_ERROR when it cannot be created.
_ensure_home() {
  if ! mkdir -p "${BROWSER_SKILL_HOME}" 2>/dev/null; then
    die "${EXIT_GENERIC_ERROR}" "cannot create ${BROWSER_SKILL_HOME}"
  fi
  # Owner-only permissions are best-effort: a chmod failure is not fatal.
  chmod 700 "${BROWSER_SKILL_HOME}" 2>/dev/null || true
}
66
+
67
# _lean_argv
# Print the llama-server argument vector for the lean config, one argument
# per line, so callers can read it back into an array without word-splitting.
_lean_argv() {
  local -a lean_flags=(
    -hf "${VLM_MODEL}"
    --host "${VLM_HOST}"
    --port "${VLM_PORT}"
    --ctx-size "${VLM_CTX}"
    --parallel "${VLM_PARALLEL}"
    --threads "${VLM_THREADS}"
    --threads-batch "${VLM_THREADS_BATCH}"
    --cache-ram "${VLM_CACHE_RAM}"
    --n-gpu-layers "${VLM_NGL}"
  )
  printf '%s\n' "${lean_flags[@]}"
}
79
+
80
# _pid_alive PID
# Succeeds when PID is non-empty and `kill -0` can signal it; fails otherwise.
_pid_alive() {
  local candidate="${1:-}"
  if [ -z "${candidate}" ]; then
    return 1
  fi
  kill -0 "${candidate}" 2>/dev/null
}
85
+
86
# _wait_port_free PORT [MAX_WAIT_S]
# Succeeds as soon as lsof reports no TCP listener on PORT; fails when a
# listener is still present after MAX_WAIT_S one-second polls (default 5).
# Phase 14 bench-fix: starting llama-server while a prior instance still held
# the port caused a silent bind failure, and the bench then talked to the
# wrong (still-loaded) model.
_wait_port_free() {
  local port="${1:?port required}"
  local budget_s="${2:-5}"
  local elapsed_s=0
  while [ "${elapsed_s}" -lt "${budget_s}" ]; do
    # lsof exits non-zero when nothing matches, i.e. the port is free.
    lsof -nP -iTCP:"${port}" -sTCP:LISTEN >/dev/null 2>&1 || return 0
    sleep 1
    elapsed_s=$((elapsed_s + 1))
  done
  return 1
}
102
+
103
# _read_pid
# Print the PID stored in ${VLM_PID_FILE} (no trailing newline). Fails when
# the file is missing, unreadable, or empty.
_read_pid() {
  local recorded
  if [ ! -f "${VLM_PID_FILE}" ]; then
    return 1
  fi
  if ! recorded="$(cat "${VLM_PID_FILE}" 2>/dev/null)"; then
    return 1
  fi
  if [ -z "${recorded}" ]; then
    return 1
  fi
  printf '%s' "${recorded}"
}
110
+
111
# cmd_start [--dry-run]
# Launch llama-server detached with the lean argument vector from _lean_argv,
# record its PID in ${VLM_PID_FILE}, and (unless the idle timeout is 0) spawn
# an idle-stop watchdog whose PID goes to ${VLM_WATCHDOG_PID_FILE}.
#
#   --dry-run   print the command line that would be launched and return 0
#
# Returns 0 when the server is already running, after a dry run, or after the
# spawn. Dies on an unknown flag, a missing llama-server binary, a port that
# is still bound after 5s, or a non-integer BROWSER_SKILL_VLM_IDLE_TIMEOUT.
cmd_start() {
  local dry_run=0
  # All scratch variables are local so nothing leaks into the caller's scope.
  local existing arg pid watchdog_pid
  local -a argv=()

  while [ "$#" -gt 0 ]; do
    case "$1" in
      --dry-run) dry_run=1; shift ;;
      *) die "${EXIT_USAGE_ERROR}" "start: unknown flag '${1}'" ;;
    esac
  done

  _ensure_home

  if existing="$(_read_pid)" && _pid_alive "${existing}"; then
    ok "vlm already running (pid ${existing}) at http://${VLM_HOST}:${VLM_PORT}"
    return 0
  fi

  # Build the argument vector once; both the dry run and the real spawn use it.
  while IFS= read -r arg; do
    argv+=("${arg}")
  done < <(_lean_argv)

  if [ "${dry_run}" = "1" ]; then
    printf '%s ' "${LLAMA_SERVER_BIN}" "${argv[@]}"
    printf '\n'
    ok "dry-run: would launch llama-server with lean config above"
    return 0
  fi

  if ! command -v "${LLAMA_SERVER_BIN}" >/dev/null 2>&1; then
    die "${EXIT_TOOL_MISSING}" "${LLAMA_SERVER_BIN} not on PATH — brew install llama.cpp"
  fi

  # Reject a malformed idle timeout before anything is spawned. Previously a
  # non-numeric value made the `-gt` test error out, which silently disabled
  # the watchdog.
  case "${VLM_IDLE_TIMEOUT_S}" in
    '' | *[!0-9]*)
      die "${EXIT_USAGE_ERROR}" \
        "start: BROWSER_SKILL_VLM_IDLE_TIMEOUT must be a non-negative integer (got '${VLM_IDLE_TIMEOUT_S}')"
      ;;
  esac

  # Port-rebind safety (bench-fix). Refuse to spawn if the port is already
  # bound by something else — otherwise llama-server bind-fails silently
  # inside nohup, /health continues to answer from whoever was already
  # there, and bench ends up talking to the wrong model.
  if ! _wait_port_free "${VLM_PORT}" 5; then
    die "${EXIT_PREFLIGHT_FAILED}" \
      "port ${VLM_PORT} still bound after 5s — check 'lsof -nP -iTCP:${VLM_PORT}' and stop the holder"
  fi

  # Detached spawn: nohup + background job, stdout+stderr appended to the
  # (freshly truncated) log, PID recorded for stop/status.
  : > "${VLM_LOG_FILE}"
  chmod 600 "${VLM_LOG_FILE}" 2>/dev/null || true
  nohup "${LLAMA_SERVER_BIN}" "${argv[@]}" >> "${VLM_LOG_FILE}" 2>&1 &
  pid=$!
  printf '%s\n' "${pid}" > "${VLM_PID_FILE}"
  chmod 600 "${VLM_PID_FILE}" 2>/dev/null || true

  ok "vlm starting (pid ${pid}) — first launch downloads ~3.5 GB to ~/.cache/huggingface/"
  ok "log: tail -f ${VLM_LOG_FILE}"
  ok "ping: curl http://${VLM_HOST}:${VLM_PORT}/health (returns {\"status\":\"ok\"} when ready)"

  # Phase 14+ smart-stop: spawn an idle-stop watchdog companion. It polls the
  # mtime of ${VLM_LAST_USED_FILE} every BROWSER_SKILL_VLM_WATCHDOG_POLL
  # seconds and kills the server once the file is older than
  # BROWSER_SKILL_VLM_IDLE_TIMEOUT. Without this, llama-server would hold
  # ~4 GB resident forever after first use. The default-rescue probe
  # (scripts/lib/visual-rescue-default.sh) is expected to touch the last-used
  # file on every invocation, so only REAL usage (not /health pings) counts.
  # Disable via BROWSER_SKILL_VLM_IDLE_TIMEOUT=0 (never stop).
  if [ "${VLM_IDLE_TIMEOUT_S}" -gt 0 ]; then
    : > "${VLM_LAST_USED_FILE}"
    chmod 600 "${VLM_LAST_USED_FILE}" 2>/dev/null || true
    # The watchdog script is a fixed single-quoted string; paths and timings
    # are passed as positional parameters, so a quote or space in
    # BROWSER_SKILL_HOME cannot break or inject into the script.
    nohup bash -c '
      set -u
      pid_file=$1
      last_used_file=$2
      watchdog_pid_file=$3
      poll_s=$4
      idle_s=$5
      while true; do
        # A failing sleep (bad poll value) must not become a busy loop.
        sleep "${poll_s}" || break
        # Server gone -> watchdog exits.
        [ -f "${pid_file}" ] || break
        srv_pid=$(cat "${pid_file}" 2>/dev/null || true)
        [ -n "${srv_pid}" ] || break
        kill -0 "${srv_pid}" 2>/dev/null || break
        # Idle check via last-used mtime. GNU stat is tried first because
        # "stat -f" means "filesystem info" there and would print junk; BSD
        # stat rejects -c and falls through to -f %m.
        if [ -f "${last_used_file}" ]; then
          now_s=$(date +%s)
          last_s=$(stat -c %Y "${last_used_file}" 2>/dev/null \
            || stat -f %m "${last_used_file}" 2>/dev/null \
            || echo "${now_s}")
          age=$((now_s - last_s))
          if [ "${age}" -ge "${idle_s}" ]; then
            kill "${srv_pid}" 2>/dev/null || true
            rm -f "${pid_file}" "${last_used_file}" 2>/dev/null || true
            break
          fi
        fi
      done
      rm -f "${watchdog_pid_file}" 2>/dev/null || true
    ' vlm-watchdog \
      "${VLM_PID_FILE}" "${VLM_LAST_USED_FILE}" "${VLM_WATCHDOG_PID_FILE}" \
      "${VLM_WATCHDOG_POLL_S}" "${VLM_IDLE_TIMEOUT_S}" \
      >> "${VLM_LOG_FILE}" 2>&1 &
    watchdog_pid=$!
    printf '%s\n' "${watchdog_pid}" > "${VLM_WATCHDOG_PID_FILE}"
    chmod 600 "${VLM_WATCHDOG_PID_FILE}" 2>/dev/null || true
    ok "watchdog (pid ${watchdog_pid}) — idle-stop after ${VLM_IDLE_TIMEOUT_S}s no use"
  fi
}
212
+
213
# cmd_stop
# Stop the idle watchdog (if any) and the llama-server recorded in
# ${VLM_PID_FILE}: SIGTERM first, SIGKILL if it is still alive after ~2s.
# Always removes the PID / last-used state files, then waits up to 5s for the
# port to be released (warning only — never fails on a still-bound port).
cmd_stop() {
  # Scratch variables are local so they do not leak into the caller's scope.
  local wd_pid existing _

  # Kill watchdog first so it doesn't see the server vanish and panic.
  if [ -f "${VLM_WATCHDOG_PID_FILE}" ]; then
    wd_pid="$(cat "${VLM_WATCHDOG_PID_FILE}" 2>/dev/null || true)"
    if [ -n "${wd_pid}" ] && kill -0 "${wd_pid}" 2>/dev/null; then
      kill "${wd_pid}" 2>/dev/null || true
    fi
    rm -f "${VLM_WATCHDOG_PID_FILE}" 2>/dev/null || true
  fi

  if existing="$(_read_pid)" && _pid_alive "${existing}"; then
    kill "${existing}" 2>/dev/null || true
    # Give it 2s (4 x 0.5s) to exit cleanly; SIGKILL if still alive.
    for _ in 1 2 3 4; do
      _pid_alive "${existing}" || break
      sleep 0.5
    done
    if _pid_alive "${existing}"; then
      kill -9 "${existing}" 2>/dev/null || true
    fi
    ok "vlm stopped (pid ${existing})"
  else
    ok "vlm not running (no-op)"
  fi
  rm -f "${VLM_PID_FILE}" "${VLM_LAST_USED_FILE}" 2>/dev/null || true

  # Port-release wait — bench-fix companion. The PID dying doesn't always
  # release the port instantly on macOS. If it is still bound after 5s, warn
  # but don't fail (the caller may have spawned a different listener that we
  # don't own).
  _wait_port_free "${VLM_PORT}" 5 \
    || warn "port ${VLM_PORT} still bound after stop; next 'start' may fail-fast (use lsof to inspect)"
}
245
+
246
# cmd_status
# Report whether the recorded llama-server is alive and answering /health.
# Returns 0 (and prints the /health body) when healthy; returns 11 when the
# server is not running or its /health endpoint is unreachable.
cmd_status() {
  local pid health_body

  # Guard 1: no live PID on record.
  if ! { pid="$(_read_pid)" && _pid_alive "${pid}"; }; then
    warn "vlm not running"
    return 11
  fi

  # Guard 2: process exists but the HTTP endpoint is not answering yet.
  if ! health_body="$(curl -sfm 3 "http://${VLM_HOST}:${VLM_PORT}/health" 2>/dev/null)"; then
    warn "vlm pid ${pid} alive but /health unreachable (still loading model?)"
    return 11
  fi

  ok "vlm running (pid ${pid}) — endpoint http://${VLM_HOST}:${VLM_PORT} healthy"
  printf '%s\n' "${health_body}"
  return 0
}
263
+
264
+ # Module-scope smoke helpers (refactored from cmd_smoke so cmd_bench can also
265
+ # call them per-model). Each emits one JSONL line on stdout + updates the
266
+ # caller's SMOKE_PASS / SMOKE_FAIL globals — keeps the function pure-ish
267
+ # without IPC. Endpoint resolution defers to VLM_HOST + VLM_PORT at call time.
268
+ SMOKE_PASS=0
269
+ SMOKE_FAIL=0
270
+
271
# _smoke_endpoint
# Print the chat-completions URL for the current VLM_HOST / VLM_PORT.
_smoke_endpoint() {
  local base_url="http://${VLM_HOST}:${VLM_PORT}"
  printf '%s/v1/chat/completions\n' "${base_url}"
}
274
+
275
# _run_text_smoke LABEL PROMPT
# Send PROMPT as a single-turn chat completion to the smoke endpoint, emit one
# JSONL line ({"smoke","type":"text","latency_s","completion"}) on stdout and
# bump SMOKE_PASS or SMOKE_FAIL. A smoke passes when the response yields a
# non-empty completion (or error message) other than the literal "null".
# A curl/jq failure is recorded as a failed smoke; it never aborts the script.
_run_text_smoke() {
  local label="$1" prompt="$2"
  local endpoint t0 t1 lat resp completion completion_json
  endpoint="$(_smoke_endpoint)"

  t0=$(python3 -c "import time;print(time.time())" 2>/dev/null || date +%s)
  # `|| resp=""`: under `set -e` a timeout / connection error would otherwise
  # terminate the whole script instead of counting as a failed smoke.
  resp="$(curl -sS -m 30 "${endpoint}" -H 'Content-Type: application/json' \
    -d "$(jq -n --arg p "${prompt}" '{model:"q",max_tokens:12,messages:[{role:"user",content:$p}]}')" 2>/dev/null)" \
    || resp=""
  t1=$(python3 -c "import time;print(time.time())" 2>/dev/null || date +%s)
  # JSON null (not the bare token n/a) keeps the output line parseable when
  # python3 is unavailable.
  lat=$(python3 -c "print(round($t1 - $t0, 2))" 2>/dev/null || echo "null")

  completion="$(printf '%s' "${resp}" \
    | jq -r '.choices[0].message.content // .error.message' 2>/dev/null)" \
    || completion=""
  # Let jq produce the quoted JSON string so backslashes, newlines and control
  # characters in the model output cannot corrupt the JSONL line.
  completion_json="$(printf '%s' "${completion}" | jq -Rs '.' 2>/dev/null)" \
    || completion_json='""'

  printf '{"smoke":"%s","type":"text","latency_s":%s,"completion":%s}\n' \
    "${label}" "${lat}" "${completion_json}"

  if [ -n "${completion}" ] && [ "${completion}" != "null" ]; then
    SMOKE_PASS=$((SMOKE_PASS + 1))
  else
    SMOKE_FAIL=$((SMOKE_FAIL + 1))
  fi
}
293
+
294
# _run_vision_smoke LABEL RGB EXPECTED
# Generate a 64x64 solid-colour PNG (RGB is "r,g,b") with python3, ask the
# model for its dominant colour, emit one JSONL line on stdout and bump
# SMOKE_PASS when the completion contains EXPECTED (case-insensitive),
# otherwise SMOKE_FAIL. Requires python3 (the battery checks this first).
# A curl/jq failure is recorded as a miss; it never aborts the script.
_run_vision_smoke() {
  local label="$1" rgb="$2" expected="$3"
  local endpoint t0 t1 lat resp png_b64
  local completion completion_json completion_lc expected_lc
  local hit="false"
  endpoint="$(_smoke_endpoint)"

  # Unquoted heredoc on purpose: ${rgb} is expanded by the shell and each
  # doubled backslash reaches Python as a single one.
  png_b64="$(python3 <<EOF
import struct, zlib, base64
W=H=64
r,g,b=${rgb}
raw=b''
for _ in range(H):
    raw += b'\\x00' + bytes((r,g,b))*W
def chunk(k,d): return struct.pack('>I', len(d)) + k + d + struct.pack('>I', zlib.crc32(k+d))
png = b'\\x89PNG\\r\\n\\x1a\\n'
png += chunk(b'IHDR', struct.pack('>IIBBBBB', W, H, 8, 2, 0, 0, 0))
png += chunk(b'IDAT', zlib.compress(raw, 9))
png += chunk(b'IEND', b'')
print(base64.b64encode(png).decode())
EOF
)"

  t0=$(python3 -c "import time;print(time.time())")
  # `|| resp=""`: a timeout must count as a miss, not kill the script under
  # `set -e`.
  resp="$(curl -sS -m 60 "${endpoint}" -H 'Content-Type: application/json' \
    -d "$(jq -n --arg img "data:image/png;base64,${png_b64}" '
      {model:"q",max_tokens:20,messages:[{role:"user",content:[
        {type:"text",text:"One word: dominant color of this image?"},
        {type:"image_url",image_url:{url:$img}}
      ]}]}')" 2>/dev/null)" \
    || resp=""
  t1=$(python3 -c "import time;print(time.time())")
  lat=$(python3 -c "print(round($t1 - $t0, 2))")

  completion="$(printf '%s' "${resp}" \
    | jq -r '.choices[0].message.content // .error.message' 2>/dev/null)" \
    || completion=""
  # jq emits a properly escaped JSON string (handles backslash / newline).
  completion_json="$(printf '%s' "${completion}" | jq -Rs '.' 2>/dev/null)" \
    || completion_json='""'

  # Lower-case with tr rather than ${var,,}: the latter needs bash >= 4 and
  # fails with "bad substitution" on the bash 3.2 that macOS ships.
  completion_lc="$(printf '%s' "${completion}" | tr '[:upper:]' '[:lower:]')"
  expected_lc="$(printf '%s' "${expected}" | tr '[:upper:]' '[:lower:]')"
  case "${completion_lc}" in
    *"${expected_lc}"*) hit="true" ;;
  esac

  printf '{"smoke":"%s","type":"vision","latency_s":%s,"completion":%s,"expected":"%s","hit":%s}\n' \
    "${label}" "${lat}" "${completion_json}" "${expected}" "${hit}"

  if [ "${hit}" = "true" ]; then
    SMOKE_PASS=$((SMOKE_PASS + 1))
  else
    SMOKE_FAIL=$((SMOKE_FAIL + 1))
  fi
}
330
+
331
# _run_vision_fixture_smoke LABEL PNG_PATH EXPECTED PROMPT
# Path-3-relevant: send a pre-rendered PNG fixture (e.g. a button shape) with
# PROMPT and check the completion. EXPECTED is one term or a |-delimited list
# of acceptable terms; the smoke hits when the completion contains ANY of
# them, case-insensitively. Emits one JSONL line on stdout and bumps
# SMOKE_PASS / SMOKE_FAIL. A missing fixture is reported as
# status "fixture-missing" and counted as a failure (return 0).
# The synthetic-colour smokes are pipeline sanity checks; this one is the
# real grounding test.
_run_vision_fixture_smoke() {
  local label="$1" png_path="$2" expected="$3" prompt="$4"
  local endpoint t0 t1 lat resp png_b64
  local completion completion_json completion_lc expected_lc term
  local -a terms=()
  local hit="false"
  endpoint="$(_smoke_endpoint)"

  if [ ! -f "${png_path}" ]; then
    printf '{"smoke":"%s","type":"vision-fixture","status":"fixture-missing","path":"%s"}\n' \
      "${label}" "${png_path}"
    SMOKE_FAIL=$((SMOKE_FAIL + 1))
    return 0
  fi

  # Reading from stdin works the same with BSD and GNU base64; tr removes the
  # line wrapping GNU adds.
  png_b64="$(base64 < "${png_path}" | tr -d '\n')"

  # Same timing fallbacks as the text smoke: the battery runs this smoke even
  # when python3 is missing, so timing must not abort the script.
  t0=$(python3 -c "import time;print(time.time())" 2>/dev/null || date +%s)
  resp="$(curl -sS -m 60 "${endpoint}" -H 'Content-Type: application/json' \
    -d "$(jq -n \
      --arg img "data:image/png;base64,${png_b64}" \
      --arg prompt "${prompt}" '
      {model:"q",max_tokens:30,messages:[{role:"user",content:[
        {type:"text",text:$prompt},
        {type:"image_url",image_url:{url:$img}}
      ]}]}')" 2>/dev/null)" \
    || resp=""
  t1=$(python3 -c "import time;print(time.time())" 2>/dev/null || date +%s)
  lat=$(python3 -c "print(round($t1 - $t0, 2))" 2>/dev/null || echo "null")

  completion="$(printf '%s' "${resp}" \
    | jq -r '.choices[0].message.content // .error.message' 2>/dev/null)" \
    || completion=""
  # jq emits a properly escaped JSON string (handles backslash / newline).
  completion_json="$(printf '%s' "${completion}" | jq -Rs '.' 2>/dev/null)" \
    || completion_json='""'

  # Broader matcher (bench-fix #5): open-ended description prompts produce
  # nuanced answers ("Blue rectangle" is a structurally-correct description of
  # a button-shape fixture), so any listed term counts as a hit.
  # tr instead of ${var,,} keeps this working on bash 3.2; `read -a` splits on
  # "|" without the pathname expansion an unquoted ${expected} would undergo.
  completion_lc="$(printf '%s' "${completion}" | tr '[:upper:]' '[:lower:]')"
  expected_lc="$(printf '%s' "${expected}" | tr '[:upper:]' '[:lower:]')"
  IFS='|' read -r -a terms <<< "${expected_lc}" || true
  # ${terms[@]+...} guards the empty-array case under `set -u` on old bash.
  for term in ${terms[@]+"${terms[@]}"}; do
    [ -n "${term}" ] || continue
    case "${completion_lc}" in
      *"${term}"*) hit="true"; break ;;
    esac
  done

  printf '{"smoke":"%s","type":"vision-fixture","latency_s":%s,"completion":%s,"expected":"%s","hit":%s}\n' \
    "${label}" "${lat}" "${completion_json}" "${expected}" "${hit}"

  if [ "${hit}" = "true" ]; then
    SMOKE_PASS=$((SMOKE_PASS + 1))
  else
    SMOKE_FAIL=$((SMOKE_FAIL + 1))
  fi
}
380
+
381
+ # Run the smoke battery against the current ${VLM_HOST}:${VLM_PORT}. Resets
382
+ # SMOKE_PASS / SMOKE_FAIL before running. Returns 0 if all smokes passed, 1
383
+ # otherwise.
384
+ #
385
+ # Battery composition (5 smokes; the last is the Path-3 grounding test):
386
+ # text_cold + text_warm — text completion + warmup speedup
387
+ # vision_red + vision_green — synthetic-color pipeline sanity (NOT a Path-3
388
+ # test — pure RGB is out-of-distribution; both
389
+ # pass means vision wiring works, both
390
+ # failing means model is colorblind on
391
+ # synthetic stimuli, not necessarily blind on
392
+ # rendered UI)
393
+ # vision_button — rendered-button-shape PNG fixture; THIS is
394
+ # the Path-3 unblock signal. Hits when the
395
+ # model identifies a button or recognises the
396
+ # blue rectangle as a UI element.
397
_run_smoke_battery() {
  local fixture_png

  # Fresh counters for this run.
  SMOKE_PASS=0
  SMOKE_FAIL=0

  # Text smokes.
  _run_text_smoke "text_cold" "Say hi in exactly one word."
  _run_text_smoke "text_warm" "Reply in exactly two words."

  # Synthetic-colour vision smokes need python3 to build the PNGs.
  if ! command -v python3 >/dev/null 2>&1; then
    warn "python3 missing — skipping synthetic-color vision smokes"
  else
    _run_vision_smoke "vision_red" "224,16,16" "red"
    _run_vision_smoke "vision_green" "0,192,32" "green"
  fi

  # Rendered-button fixture; BENCH_FIXTURE_BUTTON overrides the bundled path.
  fixture_png="${BENCH_FIXTURE_BUTTON:-${SCRIPT_DIR}/../tests/fixtures/vlm-bench/button-shape.png}"
  _run_vision_fixture_smoke "vision_button" "${fixture_png}" \
    "button|rectangle|rounded|shape|blue rectangle|ui element" \
    "Describe what you see in this image in one or two words."

  # Exit status: success only when no smoke failed.
  test "${SMOKE_FAIL}" -eq 0
}
414
+
415
# cmd_smoke
# Requires the server to be up (dies with EXIT_PREFLIGHT_FAILED otherwise).
# Runs the smoke battery, which emits one JSON line per smoke, then prints a
# final aggregate line — same shape contract as the verb scripts.
# Returns the battery's status: 0 when every smoke passed, non-zero otherwise.
cmd_smoke() {
  if ! curl -sfm 3 "http://${VLM_HOST}:${VLM_PORT}/health" >/dev/null 2>&1; then
    die "${EXIT_PREFLIGHT_FAILED}" \
      "vlm not reachable at http://${VLM_HOST}:${VLM_PORT} — run 'browser-vlm start' first"
  fi

  # Capture the status with `||`: under `set -e` a bare failing call would
  # terminate the script right here, so the summary line below would never be
  # printed for exactly the runs that need it (those with failures).
  local rc=0
  _run_smoke_battery || rc=$?

  printf '{"summary":"vlm-smoke","pass":%d,"fail":%d,"endpoint":"http://%s:%s"}\n' \
    "${SMOKE_PASS}" "${SMOKE_FAIL}" "${VLM_HOST}" "${VLM_PORT}"
  return "${rc}"
}
429
+
430
+ # Default model presets for `vlm bench`. Chosen so the table directly answers
431
+ # "is Path 3 cache-rescue unblockable at this size?".
432
# Built up one entry at a time so each preset carries its own rationale.
_BENCH_DEFAULT_MODELS=()
# Baseline (current local install).
_BENCH_DEFAULT_MODELS+=("Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M")
# Same params, less quantization.
_BENCH_DEFAULT_MODELS+=("Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0")
# Midscene's recommended default.
_BENCH_DEFAULT_MODELS+=("Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M")
437
+
438
# cmd_bench [--dry-run] [--max-wait-s N] [MODEL ...]
# Iterate a list of model tags. For each: stop any running vlm, start a fresh
# server for that model, wait for /health, verify the loaded model, run the
# smoke battery, stop. Emits a bench-start event, one bench-model JSONL row
# per model (plus the battery's own per-smoke lines) and a final bench-done
# line. The default model list answers the Path-3 unblock question —
# override it by passing models on the command line.
#
#   --dry-run        print which models would be benched + exit 0
#   --max-wait-s N   seconds to wait for each model's /health (default 600)
#
# Returns 0 when every model passed all smokes, 1 otherwise.
cmd_bench() {
  local dry_run=0 max_wait_s=600
  local -a models=()
  local models_json model bench_pass=0 bench_fail=0
  local waited loaded_model model_no_slash loaded_no_slash quant model_status

  while [ "$#" -gt 0 ]; do
    case "$1" in
      --dry-run) dry_run=1; shift ;;
      --max-wait-s)
        # Without this guard a trailing flag hits an unbound $2 under set -u.
        [ "$#" -ge 2 ] \
          || die "${EXIT_USAGE_ERROR}" "bench: --max-wait-s requires a value"
        max_wait_s="$2"
        shift 2
        ;;
      --help|-h)
        cat <<'BENCHUSAGE'
browser-vlm bench [--dry-run] [--max-wait-s N] [MODEL [MODEL ...]]

Bench multiple models against the same smoke battery (text-cold, text-warm,
vision-red, vision-green, vision-button). Stops any running vlm first, then for each model:
start → wait /health → smoke → stop. Emits one JSONL row per model + final.

Default model list (answers the Path-3 unblock question):
  Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
  Qwen/Qwen3-VL-4B-Instruct-GGUF:Q8_0
  Qwen/Qwen3-VL-8B-Instruct-GGUF:Q4_K_M

Use --dry-run to confirm the list without downloading anything.
BENCHUSAGE
        return 0
        ;;
      -*) die "${EXIT_USAGE_ERROR}" "bench: unknown flag '${1}'" ;;
      *) models+=("$1"); shift ;;
    esac
  done

  # The value feeds `[ -lt ]` and printf %d below; both need an integer.
  case "${max_wait_s}" in
    '' | *[!0-9]*)
      die "${EXIT_USAGE_ERROR}" \
        "bench: --max-wait-s must be a non-negative integer (got '${max_wait_s}')"
      ;;
  esac

  [ "${#models[@]}" -gt 0 ] || models=("${_BENCH_DEFAULT_MODELS[@]}")

  _ensure_home

  # Emit start event (machine-parseable plan).
  models_json="$(printf '%s\n' "${models[@]}" \
    | jq -R -s -c 'split("\n") | map(select(length > 0))')"
  printf '{"event":"bench-start","models":%s,"ts":"%s"}\n' \
    "${models_json}" "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  if [ "${dry_run}" = "1" ]; then
    ok "dry-run: would bench ${#models[@]} model(s); no downloads, no spawns"
    printf '{"event":"bench-done","total_models":%d,"dry_run":true}\n' "${#models[@]}"
    return 0
  fi

  if ! command -v "${LLAMA_SERVER_BIN}" >/dev/null 2>&1; then
    die "${EXIT_TOOL_MISSING}" "${LLAMA_SERVER_BIN} not on PATH — brew install llama.cpp"
  fi

  for model in "${models[@]}"; do
    # Defensive stop (whatever was running before).
    cmd_stop >/dev/null 2>&1 || true

    # Spawn this model. The subshell isolates `die` inside cmd_start so a
    # missing tool / busy port doesn't kill the whole bench — instead we
    # record the per-model failure and continue.
    # VLM_MODEL must be set directly: it was resolved from
    # BROWSER_SKILL_VLM_MODEL when the script loaded, so overriding only the
    # env var here would launch the same default model on every iteration.
    if ! (
      VLM_MODEL="${model}"
      export BROWSER_SKILL_VLM_MODEL="${model}"
      cmd_start
    ) >/dev/null 2>&1; then
      printf '{"event":"bench-model","model":"%s","status":"start-failed"}\n' \
        "${model}"
      bench_fail=$((bench_fail + 1))
      continue
    fi

    # Poll /health with bounded wait so a slow download doesn't hang forever.
    waited=0
    while [ "${waited}" -lt "${max_wait_s}" ]; do
      if curl -sfm 2 "http://${VLM_HOST}:${VLM_PORT}/health" >/dev/null 2>&1; then
        break
      fi
      sleep 5
      waited=$((waited + 5))
    done

    if ! curl -sfm 2 "http://${VLM_HOST}:${VLM_PORT}/health" >/dev/null 2>&1; then
      printf '{"event":"bench-model","model":"%s","status":"timeout","wait_s":%d}\n' \
        "${model}" "${max_wait_s}"
      bench_fail=$((bench_fail + 1))
      cmd_stop >/dev/null 2>&1 || true
      continue
    fi

    # Bench-fix #4: model-identity verification. /health returns 200 no matter
    # which model ended up loaded, so query /v1/models, take the first entry's
    # id, and require the requested repo+quant to appear as a substring.
    # NOTE(review): the quant-only fallback below cannot tell apart two repos
    # that share a quant tag (e.g. 4B vs 8B at Q4_K_M).
    loaded_model="$(curl -sm 3 "http://${VLM_HOST}:${VLM_PORT}/v1/models" 2>/dev/null \
      | jq -r '.data[0].id // ""' 2>/dev/null)" || loaded_model=""
    if [ -n "${loaded_model}" ]; then
      # The requested model spec is "vendor/repo:quant"; the loaded id usually
      # contains "vendor/repo" and the quant tag.
      model_no_slash="${model//\//_}"
      loaded_no_slash="${loaded_model//\//_}"
      case "${loaded_no_slash}" in
        *"${model_no_slash}"*) : ;; # requested spec is a substring — ok
        *)
          # Try the quant tag alone (some servers report only the quant).
          quant="${model##*:}"
          case "${loaded_model}" in
            *"${quant}"*) : ;;
            *)
              printf '{"event":"bench-model","model":"%s","status":"model-mismatch","loaded_as":"%s"}\n' \
                "${model}" "${loaded_model}"
              bench_fail=$((bench_fail + 1))
              cmd_stop >/dev/null 2>&1 || true
              continue
              ;;
          esac
          ;;
      esac
    fi

    # Run smokes for this model. CRITICAL: do NOT command-substitute the
    # battery call — its stdout IS the smoke NDJSON, and we want those lines
    # streamed live + SMOKE_PASS/SMOKE_FAIL incrementing in the parent shell
    # (a subshell would lose both). Capture the rc via the if/else branch.
    if _run_smoke_battery; then
      model_status="ok"
    else
      model_status="partial" # smokes ran but some missed
    fi
    printf '{"event":"bench-model","model":"%s","status":"%s","pass":%d,"fail":%d}\n' \
      "${model}" "${model_status}" "${SMOKE_PASS}" "${SMOKE_FAIL}"
    if [ "${SMOKE_FAIL}" -eq 0 ]; then
      bench_pass=$((bench_pass + 1))
    else
      bench_fail=$((bench_fail + 1))
    fi

    cmd_stop >/dev/null 2>&1 || true
  done

  printf '{"event":"bench-done","total_models":%d,"pass":%d,"fail":%d}\n' \
    "${#models[@]}" "${bench_pass}" "${bench_fail}"
  [ "${bench_fail}" -eq 0 ] || return 1
}
585
+
586
+ # --- install-env / uninstall-env (Phase 14+ auto-management) -----------
587
+ # Append two env exports (BROWSER_SKILL_VISION_FALLBACK=1 +
588
+ # BROWSER_SKILL_VISUAL_RESCUE_CMD=<bundled probe path>) to the user's shell
589
+ # init so Claude Code subprocesses inherit them. Idempotent — re-running is
590
+ # a no-op if the marked block already exists.
591
# Sentinel lines delimiting the block managed inside the user's shell rc.
# Single-quoted: the text is literal and must never be expanded.
VLM_ENV_MARKER_BEGIN='# >>> browser-skill VLM auto-management (Path 3) >>>'
VLM_ENV_MARKER_END='# <<< browser-skill VLM auto-management <<<'
593
+
594
+ cmd_install_env() {
595
+ local target="${BROWSER_SKILL_INSTALL_SHELL_RC:-${HOME}/.zshrc}"
596
+ while [ "$#" -gt 0 ]; do
597
+ case "$1" in
598
+ --shell-rc) target="$2"; shift 2 ;;
599
+ --help|-h)
600
+ cat <<'IEUSAGE'
601
+ browser-vlm install-env [--shell-rc PATH]
602
+
603
+ Append the two env exports to your shell init so Claude Code subprocesses
604
+ inherit them. After running, open a new shell (or `source` the rc) to
605
+ activate. Idempotent — running twice is a no-op.
606
+
607
+ Defaults to ~/.zshrc. Override with --shell-rc /path/to/.bashrc (etc).
608
+ IEUSAGE
609
+ return 0 ;;
610
+ *) die "${EXIT_USAGE_ERROR}" "install-env: unknown flag '${1}'" ;;
611
+ esac
612
+ done
613
+ local probe_path
614
+ probe_path="$(cd "${SCRIPT_DIR}/lib" 2>/dev/null && pwd)/visual-rescue-default.sh"
615
+ [ -f "${probe_path}" ] \
616
+ || die "${EXIT_PREFLIGHT_FAILED}" "bundled probe missing at ${probe_path}"
617
+ if [ -f "${target}" ] && grep -qF "${VLM_ENV_MARKER_BEGIN}" "${target}" 2>/dev/null; then
618
+ ok "browser-skill env already installed in ${target} (no-op)"
619
+ return 0
620
+ fi
621
+ {
622
+ printf '\n%s\n' "${VLM_ENV_MARKER_BEGIN}"
623
+ printf '# Auto-added by `browser-vlm install-env`. Edit via `browser-vlm uninstall-env`.\n'
624
+ printf 'export BROWSER_SKILL_VISION_FALLBACK=1\n'
625
+ printf 'export BROWSER_SKILL_VISUAL_RESCUE_CMD=%q\n' "${probe_path}"
626
+ printf '%s\n' "${VLM_ENV_MARKER_END}"
627
+ } >> "${target}"
628
+ ok "added browser-skill env exports to ${target}"
629
+ ok "activate: 'source ${target}' or open a new shell"
630
+ }
631
+
632
#######################################
# Remove the env-export block that install-env added to a shell rc file.
# Globals:
#   BROWSER_SKILL_INSTALL_SHELL_RC (read) - default rc path override
#   HOME (read)                           - fallback rc is ${HOME}/.zshrc
#   VLM_ENV_MARKER_BEGIN / VLM_ENV_MARKER_END (read)
#   EXIT_USAGE_ERROR / EXIT_PREFLIGHT_FAILED (read)
# Arguments:
#   --shell-rc PATH  rc file to modify
#   --help | -h      print usage and return 0
# Outputs:
#   Status messages via ok(); usage text on stdout for --help.
# Returns:
#   0 when the block was removed or was not present (no-op); otherwise
#   terminates through die(), leaving the rc file untouched.
#######################################
cmd_uninstall_env() {
  local target="${BROWSER_SKILL_INSTALL_SHELL_RC:-${HOME}/.zshrc}"
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --shell-rc)
        # Reject a missing/empty value up front: otherwise "$2" is unbound
        # and `shift 2` fails without shifting, so the loop never advances.
        if [ "$#" -lt 2 ] || [ -z "$2" ]; then
          die "${EXIT_USAGE_ERROR}" "uninstall-env: --shell-rc requires a PATH argument"
        fi
        target="$2"
        shift 2
        ;;
      --help|-h)
        cat <<'UEUSAGE'
browser-vlm uninstall-env [--shell-rc PATH]

Remove the env-export block previously added by `install-env`. Idempotent
— running on a clean rc is a no-op.
UEUSAGE
        return 0 ;;
      *) die "${EXIT_USAGE_ERROR}" "uninstall-env: unknown flag '${1}'" ;;
    esac
  done

  if [ ! -f "${target}" ] || ! grep -qF "${VLM_ENV_MARKER_BEGIN}" "${target}" 2>/dev/null; then
    ok "no browser-skill env block found in ${target} (no-op)"
    return 0
  fi

  local tmp
  tmp="$(mktemp "${target}.uninstall.XXXXXX")" \
    || die "${EXIT_PREFLIGHT_FAILED}" "uninstall-env: cannot create temp file next to ${target}"

  # Drop every line from the begin marker through the end marker. If the end
  # marker never shows up, awk exits non-zero so we do not silently delete
  # the remainder of the user's rc file.
  if ! awk -v b="${VLM_ENV_MARKER_BEGIN}" -v e="${VLM_ENV_MARKER_END}" '
    index($0, b) { skip = 1; next }
    skip && index($0, e) { skip = 0; next }
    !skip
    END { if (skip) exit 1 }
  ' "${target}" > "${tmp}"; then
    rm -f -- "${tmp}"
    die "${EXIT_PREFLIGHT_FAILED}" \
      "uninstall-env: env block in ${target} has no end marker (or awk failed); file left unchanged"
  fi

  # Overwrite in place rather than `mv`: renaming the mktemp file over the
  # target would replace a symlinked rc with a regular file and reset its
  # mode to mktemp's 0600. Trade-off: the write is no longer an atomic rename.
  if ! cat -- "${tmp}" > "${target}"; then
    rm -f -- "${tmp}"
    die "${EXIT_PREFLIGHT_FAILED}" "uninstall-env: cannot write to ${target}"
  fi
  rm -f -- "${tmp}"
  ok "removed browser-skill env block from ${target}"
}
662
+
663
# Top-level subcommand dispatch. The first CLI argument selects the handler;
# `shift` drops the subcommand name before the handler is called.
# start, bench, install-env and uninstall-env receive the remaining
# arguments via "$@"; stop, status and smoke are called with no arguments,
# so anything after those subcommand names is ignored.
+ case "${1:-}" in
664
+ start) shift; cmd_start "$@" ;;
665
+ stop) shift; cmd_stop ;;
666
+ status) shift; cmd_status ;;
667
+ smoke) shift; cmd_smoke ;;
668
+ bench) shift; cmd_bench "$@" ;;
669
+ install-env) shift; cmd_install_env "$@" ;;
670
+ uninstall-env) shift; cmd_uninstall_env "$@" ;;
671
# No argument at all ("${1:-}" expands to "") is treated like --help:
# print the usage text below and exit 0.
+ --help|-h|help|"")
672
+ cat <<'USAGE'
673
+ browser-vlm — local llama-server lifecycle wrapper (lean config)
674
+
675
+ Usage:
676
+ bash scripts/browser-vlm.sh start [--dry-run] # spawn llama-server in bg
677
+ bash scripts/browser-vlm.sh stop # kill running instance
678
+ bash scripts/browser-vlm.sh status # ping /health
679
+ bash scripts/browser-vlm.sh smoke # 4-smoke battery (text+vision)
680
+ bash scripts/browser-vlm.sh bench [MODEL...] # bench multiple models
681
+ bash scripts/browser-vlm.sh install-env # persist env exports to ~/.zshrc
682
+ bash scripts/browser-vlm.sh uninstall-env # remove the env block
683
+ bash scripts/browser-vlm.sh --help # this message
684
+
685
+ Lean defaults (override via BROWSER_SKILL_VLM_*; see top of script):
686
+ model: Qwen/Qwen3-VL-4B-Instruct-GGUF:Q4_K_M
687
+ endpoint: http://127.0.0.1:8080
688
+ ctx-size: 8192 (vs 175616 fat default)
689
+ parallel slots: 1 (vs 4 fat default)
690
+ threads: 4 (vs all P-cores fat default)
691
+ threads-batch: 6
692
+ cache-ram: 512 MiB (vs 8192 fat default)
693
+ n-gpu-layers: 99 (Metal offload — macOS default)
694
+
695
+ State files (mode 0600 inside ~/.browser-skill/):
696
+ vlm.pid pid of running llama-server
697
+ vlm.log stdout+stderr from llama-server
698
+
699
+ First launch downloads ~3.5 GB to ~/.cache/huggingface/hub/. Subsequent
700
+ launches start in ~5 s on M-series Macs.
701
+ USAGE
702
+ exit 0
703
+ ;;
704
# Any other first argument is rejected through die() with EXIT_USAGE_ERROR.
+ *)
705
+ die "${EXIT_USAGE_ERROR}" "unknown subcommand '${1}' — see --help"
706
+ ;;
707
+ esac