github-router 0.3.45 → 0.3.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,15 +20,38 @@
20
20
 
21
21
  const NATIVE_HOST_NAME = "com.githubrouter.browser"
22
22
 
23
+ // Snapshot cache + invalidation lives in a sibling module so the
24
+ // matcher-cascade work in Phase 2 can consume it without dragging in
25
+ // the entire 1700-line background.js dispatcher.
26
+ import {
27
+ captureSnapshot,
28
+ invalidateSnapshot,
29
+ } from "./snapshot.js"
30
+ import { extractSnapshotCDP } from "./snapshot-cdp.js"
31
+
23
32
  // ---------------------------------------------------------------------
24
- // Navigation policy — list of URL patterns blocked from open / navigate.
25
- // Mirrored in src/lib/browser-mcp/policy.ts (defense in depth).
33
+ // Navigation policy — URL patterns this extension blocks at
34
+ // webNavigation.onBeforeNavigate. This list is INTENTIONALLY NARROWER
35
+ // than the bridge-side regex in src/lib/browser-mcp/policy.ts: the
36
+ // bridge regex only fires for tool-initiated nav (browser_open_tab /
37
+ // browser_navigate) so it can safely block `extensions` without
38
+ // affecting the human user, while THIS regex fires for user-typed URL
39
+ // bar nav too and must preserve human access to chrome://extensions /
40
+ // edge://extensions (needed to reload this extension after package
41
+ // updates).
26
42
  // ---------------------------------------------------------------------
27
43
 
44
+ // `extensions` is intentionally omitted from the extension-side regex —
45
+ // chrome.webNavigation.onBeforeNavigate fires for ALL top-level
46
+ // navigations including the user typing in the URL bar, so including it
47
+ // here would lock the user out of managing the very extension that
48
+ // loads this code (and prevent the reload arrow that auto-update falls
49
+ // back to). Bridge-side policy.ts keeps `extensions` in its regex,
50
+ // which is sufficient because the bridge regex only gates tool-
51
+ // initiated nav (browser_open_tab / browser_navigate).
28
52
  const BLOCKED_URL_RE =
29
- /^(chrome|edge|brave|opera|vivaldi):\/\/(settings|preferences|extensions|policy|management|password|flags|flag-descriptions)/i
30
- const BLOCKED_VIEW_SOURCE_RE =
31
- /^view-source:(chrome|edge):\/\/(settings|extensions)/i
53
+ /^(chrome|edge|brave|opera|vivaldi):\/\/(settings|preferences|policy|management|password|flags|flag-descriptions)/i
54
+ const BLOCKED_VIEW_SOURCE_RE = /^view-source:(chrome|edge):\/\/settings/i
32
55
 
33
56
  function isBlockedUrl(url) {
34
57
  if (typeof url !== "string") return false
@@ -170,40 +193,117 @@ async function toolScreenshot(args) {
170
193
  async function toolReadPage(args) {
171
194
  const tabId = typeof args.tabId === "number" ? args.tabId : undefined
172
195
  if (!tabId) throw new Error("browser_read_page: tabId is required")
196
+ const mode = args.mode === "full" ? "full" : "summary"
197
+ const refresh = args.refresh === true
198
+ // Phase 1c-CDP: try the CDP `Accessibility.getFullAXTree`-based
199
+ // extractor first. Better cross-origin iframe coverage, real
200
+ // platform-computed accessible names, AX-tree-flagged hidden /
201
+ // disabled state. Falls back to the legacy DOM walker when CDP
202
+ // attach fails (enterprise DeveloperToolsAvailability=2, DevTools
203
+ // already open on the tab, sandbox restriction). The fallback path
204
+ // produces a strict-subset shape so consumers don't have to branch.
205
+ const extractor = async (tId, ext) => {
206
+ try {
207
+ return await extractSnapshotCDP(tId, ext, {
208
+ attachDebugger: attachDebuggerOnce,
209
+ sendCommand: (id, method, params) => chrome.debugger.sendCommand({ tabId: id }, method, params),
210
+ })
211
+ } catch (err) {
212
+ const msg = err && err.message ? err.message : String(err)
213
+ console.warn(`[browser-mcp/snapshot] CDP extractor failed, falling back to legacy: ${msg}`)
214
+ return await extractSnapshotLegacy(tId, ext)
215
+ }
216
+ }
217
+ return captureSnapshot(tabId, { mode, refresh }, extractor)
218
+ }
219
+
220
+ /**
221
+ * Legacy in-page DOM extractor: a TreeWalker pass over the document
222
+ * and its open shadow roots. toolReadPage tries the CDP extractor first
223
+ * and falls back to this path when that fails (enterprise policy,
224
+ * DevTools open on the tab, etc.). The implementation runs in the page via
225
+ * chrome.scripting.executeScript and returns a PageSnapshot-shaped
226
+ * object that snapshot.captureSnapshot caches and returns to the
227
+ * caller.
228
+ */
229
+ async function extractSnapshotLegacy(tabId, opts) {
230
+ const mode = opts?.mode === "full" ? "full" : "summary"
173
231
  const [result] = await chrome.scripting.executeScript({
174
232
  target: { tabId },
175
- func: () => {
176
- // Element refs: every interactive element gets an id we return to
177
- // the caller; subsequent click/fill calls reference these refs
178
- // instead of brittle CSS selectors. Refs are stable for the
179
- // lifetime of a single read_page snapshot.
180
- const interactive = "a, button, input, select, textarea, [role='button'], [role='link'], [role='checkbox']"
181
- const els = Array.from(document.querySelectorAll(interactive))
182
- const elements = els.slice(0, 200).map((el, i) => {
183
- const ref = `e${i + 1}`
184
- el.setAttribute("data-gh-router-ref", ref)
185
- const rect = el.getBoundingClientRect()
186
- return {
187
- ref,
188
- role: el.getAttribute("role") || el.tagName.toLowerCase(),
189
- name:
190
- (el.getAttribute("aria-label") ||
191
- el.textContent ||
192
- el.getAttribute("value") ||
193
- el.getAttribute("placeholder") ||
194
- "")
195
- .trim()
196
- .slice(0, 200),
197
- bbox: [Math.round(rect.x), Math.round(rect.y), Math.round(rect.width), Math.round(rect.height)],
233
+ func: (mode) => {
234
+ // Stable ref attribution: every interactive element gets a
235
+ // data-gh-router-ref attribute the model uses for subsequent
236
+ // ref-based actions. Existing refs are reused across read_page calls.
237
+ //
238
+ // Traversal: descend into open shadow roots so web-component-heavy
239
+ // UIs (e.g. modern React apps with shadow encapsulation) surface
240
+ // their interactive elements. Cross-origin iframes are not reached
241
+ // from in-page script; covering them needs CDP, which this legacy
242
+ // walker does not use.
243
+ const INTERACTIVE_ROLES = new Set([
244
+ "button",
245
+ "link",
246
+ "textbox",
247
+ "combobox",
248
+ "checkbox",
249
+ "radio",
250
+ "switch",
251
+ "tab",
252
+ "menuitem",
253
+ "option",
254
+ "slider",
255
+ "searchbox",
256
+ "spinbutton",
257
+ "treeitem",
258
+ ])
259
+ const INTERACTIVE_TAGS = new Set([
260
+ "a",
261
+ "button",
262
+ "input",
263
+ "select",
264
+ "textarea",
265
+ ])
266
+ function isInteractive(el) {
267
+ const role = el.getAttribute("role")
268
+ if (role && INTERACTIVE_ROLES.has(role)) return true
269
+ if (INTERACTIVE_TAGS.has(el.tagName.toLowerCase())) return true
270
+ if (el.hasAttribute("contenteditable") && el.getAttribute("contenteditable") !== "false") return true
271
+ const ti = el.getAttribute("tabindex")
272
+ if (ti !== null && Number.parseInt(ti, 10) >= 0) return true
273
+ return false
274
+ }
275
+ function nameOf(el) {
276
+ const labelledBy = el.getAttribute("aria-labelledby")
277
+ if (labelledBy) {
278
+ const labelEl = document.getElementById(labelledBy)
279
+ if (labelEl) return (labelEl.textContent || "").trim().slice(0, 200)
198
280
  }
199
- })
200
- // Page text: innerText is roughly what a user reads. Cap at
201
- // 256 KiB to keep the response tractable.
202
- const MAX = 256 * 1024
203
- let text = document.body ? document.body.innerText : ""
204
- if (text.length > MAX) text = text.slice(0, MAX)
205
- // Viewport metadata so the model can correlate CSS-px bbox to
206
- // device-px pixels in browser_screenshot (device_px = css_px * dpr).
281
+ return (
282
+ el.getAttribute("aria-label")
283
+ || el.getAttribute("title")
284
+ || (el.textContent || "")
285
+ || el.getAttribute("value")
286
+ || el.getAttribute("placeholder")
287
+ || el.getAttribute("alt")
288
+ || ""
289
+ ).trim().slice(0, 200)
290
+ }
291
+ function walkDeep(root, sink) {
292
+ // Walk every element under root, descending into open shadow
293
+ // roots. Closed shadow roots are intentionally opaque per the
294
+ // web spec; nothing we can do.
295
+ // NodeFilter.SHOW_ELEMENT === 1.
296
+ const walker = root.createTreeWalker
297
+ ? root.createTreeWalker(root, 1)
298
+ : document.createTreeWalker(root, 1)
299
+ let n
300
+ while ((n = walker.nextNode())) {
301
+ sink.push(n)
302
+ if (n.shadowRoot && n.shadowRoot.mode === "open") {
303
+ walkDeep(n.shadowRoot, sink)
304
+ }
305
+ }
306
+ }
207
307
  const viewport = {
208
308
  width: window.innerWidth,
209
309
  height: window.innerHeight,
@@ -211,8 +311,258 @@ async function toolReadPage(args) {
211
311
  scrollX: window.scrollX,
212
312
  scrollY: window.scrollY,
213
313
  }
214
- return { text, elements, viewport }
314
+ function inViewport(rect) {
315
+ return (
316
+ rect.bottom > 0
317
+ && rect.right > 0
318
+ && rect.top < viewport.height
319
+ && rect.left < viewport.width
320
+ && rect.width > 0
321
+ && rect.height > 0
322
+ )
323
+ }
324
+ const allElements = []
325
+ walkDeep(document, allElements)
326
+ const interactive = allElements.filter(isInteractive)
327
+ // Stable refs across snapshots: if an element already carries a
328
+ // data-gh-router-ref from a prior snapshot, keep it. New elements
329
+ // get the next unused counter. Result: ref `e42` refers to the
330
+ // SAME element across reads, so model can do `read_page → click(ref)
331
+ // → read_page` and the ref-to-element binding stays valid.
332
+ const usedRefs = new Set()
333
+ for (const el of interactive) {
334
+ const existing = el.getAttribute("data-gh-router-ref")
335
+ if (existing && /^e\d+$/.test(existing)) usedRefs.add(existing)
336
+ }
337
+ let nextRef = 1
338
+ function nextFreshRef() {
339
+ while (usedRefs.has(`e${nextRef}`)) nextRef++
340
+ const r = `e${nextRef}`
341
+ usedRefs.add(r)
342
+ nextRef++
343
+ return r
344
+ }
345
+ // Summary mode: viewport-visible only; drop nameless non-tag
346
+ // elements (a div with role="button" but no aria-label is noise).
347
+ // Full mode: keep everything, model asked for it.
348
+ const ELEMENT_CAP = 200
349
+ const LANDMARK_ROLES = new Set([
350
+ "dialog", "alertdialog", "region", "navigation", "main",
351
+ "form", "search", "complementary", "banner", "contentinfo",
352
+ ])
353
+ const LANDMARK_TAGS = new Set([
354
+ "dialog", "form", "nav", "main", "header", "footer", "aside", "section",
355
+ ])
356
+ // Pre-mint refs for landmark ancestors so child elements can
357
+ // cite parent refs without a second walk.
358
+ function landmarkRefsFor(el) {
359
+ const refs = []
360
+ let cur = el.parentElement
361
+ let depth = 0
362
+ while (cur && depth < 12 && refs.length < 4) {
363
+ const role = cur.getAttribute && cur.getAttribute("role")
364
+ const ctag = cur.tagName && cur.tagName.toLowerCase()
365
+ const isLandmark = (role && LANDMARK_ROLES.has(role)) || LANDMARK_TAGS.has(ctag)
366
+ if (isLandmark) {
367
+ let r = cur.getAttribute("data-gh-router-ref")
368
+ if (!r || !/^e\d+$/.test(r)) {
369
+ r = nextFreshRef()
370
+ cur.setAttribute("data-gh-router-ref", r)
371
+ }
372
+ refs.push(r)
373
+ }
374
+ cur = cur.parentElement
375
+ depth++
376
+ }
377
+ return refs
378
+ }
379
+ function stateFlagsFor(el, tag) {
380
+ const flags = {}
381
+ // disabled: prefer the property (more reliable than the attr
382
+ // for inputs / buttons; aria-disabled covers role=button divs).
383
+ if (el.disabled === true || el.getAttribute("aria-disabled") === "true") flags.disabled = true
384
+ if (el.checked === true) flags.checked = true
385
+ else if (el.indeterminate === true) flags.checked = "mixed"
386
+ else if (el.getAttribute("aria-checked") === "true") flags.checked = true
387
+ else if (el.getAttribute("aria-checked") === "mixed") flags.checked = "mixed"
388
+ const aria = (name) => el.getAttribute(name)
389
+ if (aria("aria-expanded") === "true") flags.expanded = true
390
+ else if (aria("aria-expanded") === "false") flags.expanded = false
391
+ if (el.selected === true || aria("aria-selected") === "true") flags.selected = true
392
+ if (aria("aria-pressed") === "true") flags.pressed = true
393
+ else if (aria("aria-pressed") === "false") flags.pressed = false
394
+ if (el.required === true || aria("aria-required") === "true") flags.required = true
395
+ if (el.readOnly === true || aria("aria-readonly") === "true") flags.readonly = true
396
+ if (aria("aria-invalid") === "true") flags.invalid = true
397
+ if (document.activeElement === el) flags.focused = true
398
+ // hidden: aria-hidden takes precedence; offsetParent === null
399
+ // covers display:none parents (NOT a reliable visibility check
400
+ // for fixed-position elements but a reasonable cheap signal).
401
+ if (aria("aria-hidden") === "true") flags.hidden = true
402
+ else if (tag !== "body" && el.offsetParent === null && getComputedStyle(el).position !== "fixed") {
403
+ flags.hidden = true
404
+ }
405
+ return flags
406
+ }
407
+ function inputExtrasFor(el, tag) {
408
+ const out = {}
409
+ if (tag === "input" || tag === "textarea" || tag === "select") {
410
+ const t = (el.type || "").toLowerCase()
411
+ if (t) out.inputType = t
412
+ }
413
+ const ph = el.placeholder || el.getAttribute("placeholder")
414
+ if (ph) out.placeholder = String(ph).slice(0, 200)
415
+ const ac = el.getAttribute("autocomplete")
416
+ if (ac) out.autocomplete = ac
417
+ // For inputs / textareas / select, value is the current user
418
+ // input. Bounded so a huge textarea doesn't bloat the snapshot.
419
+ if (typeof el.value === "string" && el.value.length > 0) {
420
+ out.value = el.value.slice(0, 200)
421
+ }
422
+ return out
423
+ }
424
+ function attrExtrasFor(el) {
425
+ // Surface raw attrs the matcher's L5 testid layer + L7 semantic
426
+ // heuristic want to see. Limited to a handful — we don't want
427
+ // to dump every attribute on every element.
428
+ const out = {}
429
+ const id = el.id
430
+ if (id) out.id = id
431
+ const testid = el.getAttribute("data-testid") || el.getAttribute("data-test-id")
432
+ || el.getAttribute("data-test") || el.getAttribute("data-qa")
433
+ if (testid) out.testid = testid
434
+ const nameAttr = el.getAttribute("name")
435
+ if (nameAttr) out.name_attr = nameAttr
436
+ const aria = el.getAttribute("aria-label")
437
+ if (aria) out.aria_label = aria
438
+ return out
439
+ }
440
+ const elements = []
441
+ for (const el of interactive) {
442
+ if (elements.length >= ELEMENT_CAP) break
443
+ const rect = el.getBoundingClientRect()
444
+ if (mode === "summary" && !inViewport(rect)) continue
445
+ const name = nameOf(el)
446
+ const tag = el.tagName.toLowerCase()
447
+ if (mode === "summary" && !name && !INTERACTIVE_TAGS.has(tag)) continue
448
+ let ref = el.getAttribute("data-gh-router-ref")
449
+ if (!ref || !/^e\d+$/.test(ref)) {
450
+ ref = nextFreshRef()
451
+ el.setAttribute("data-gh-router-ref", ref)
452
+ }
453
+ const entry = {
454
+ ref,
455
+ role: el.getAttribute("role") || tag,
456
+ tag,
457
+ bbox: [
458
+ Math.round(rect.x),
459
+ Math.round(rect.y),
460
+ Math.round(rect.width),
461
+ Math.round(rect.height),
462
+ ],
463
+ }
464
+ if (name) entry.name = name
465
+ // Inline state flags onto the entry. Each is omitted when
466
+ // false / default per the snapshot-types contract.
467
+ const flags = stateFlagsFor(el, tag)
468
+ for (const k of Object.keys(flags)) entry[k] = flags[k]
469
+ // Input-shaped extras (placeholder / inputType / value /
470
+ // autocomplete) — only present for input-shaped elements.
471
+ const inExtras = inputExtrasFor(el, tag)
472
+ if (inExtras.inputType) entry.inputType = inExtras.inputType
473
+ if (inExtras.placeholder) entry.placeholder = inExtras.placeholder
474
+ if (inExtras.autocomplete) entry.autocomplete = inExtras.autocomplete
475
+ if (inExtras.value) entry.value = inExtras.value
476
+ // Raw attribute extras for L5 testid + L7 semantic layers.
477
+ // Stored on a single `attrs` object to keep the top-level
478
+ // shape stable.
479
+ const attrExtras = attrExtrasFor(el)
480
+ if (Object.keys(attrExtras).length > 0) entry.attrs = attrExtras
481
+ // Landmark ancestry — up to 4 deep, dialog / form / nav / etc.
482
+ const landmarks = landmarkRefsFor(el)
483
+ if (landmarks.length > 0) entry.landmarks = landmarks
484
+ elements.push(entry)
485
+ }
486
+ // Text extraction.
487
+ // summary: walk text nodes whose parent is in the viewport; cap
488
+ // at 20 KB. The model sees what a user could read without
489
+ // scrolling. Off-screen content remains reachable via mode:"full".
490
+ // full: 256 KiB innerText cap (legacy behavior).
491
+ let text = ""
492
+ if (mode === "full") {
493
+ const MAX_FULL = 256 * 1024
494
+ text = document.body ? document.body.innerText : ""
495
+ if (text.length > MAX_FULL) text = text.slice(0, MAX_FULL)
496
+ } else {
497
+ const TEXT_CAP = 20 * 1024
498
+ const parts = []
499
+ let total = 0
500
+ const root = document.body || document.documentElement
501
+ if (root) {
502
+ const tw = document.createTreeWalker(root, 4) // NodeFilter.SHOW_TEXT === 4
503
+ let n
504
+ while ((n = tw.nextNode())) {
505
+ const parent = n.parentElement
506
+ if (!parent) continue
507
+ // Skip script/style content.
508
+ const ptag = parent.tagName ? parent.tagName.toLowerCase() : ""
509
+ if (ptag === "script" || ptag === "style" || ptag === "noscript") continue
510
+ const pr = parent.getBoundingClientRect()
511
+ if (!inViewport(pr)) continue
512
+ const t = (n.textContent || "").replace(/\s+/g, " ").trim()
513
+ if (!t) continue
514
+ if (total + t.length + 1 > TEXT_CAP) {
515
+ parts.push(t.slice(0, Math.max(0, TEXT_CAP - total)))
516
+ break
517
+ }
518
+ parts.push(t)
519
+ total += t.length + 1
520
+ }
521
+ }
522
+ text = parts.join("\n")
523
+ }
524
+ // visualSurfaces: canvas + svg of non-trivial size in the
525
+ // viewport. Signals "this region needs vision" to the lead model
526
+ // so it knows to call browser_screenshot / let browser_act
527
+ // auto-escalate when the text-based pickElement misses.
528
+ const visualSurfaces = []
529
+ const VS_MIN = 100
530
+ const canvasNodes = allElements.filter((el) => {
531
+ const t = el.tagName && el.tagName.toLowerCase()
532
+ return t === "canvas" || t === "svg"
533
+ })
534
+ for (const el of canvasNodes) {
535
+ const rect = el.getBoundingClientRect()
536
+ if (rect.width < VS_MIN || rect.height < VS_MIN) continue
537
+ if (!inViewport(rect)) continue
538
+ let ref = el.getAttribute("data-gh-router-ref")
539
+ if (!ref) {
540
+ ref = `v${visualSurfaces.length + 1}`
541
+ el.setAttribute("data-gh-router-ref", ref)
542
+ }
543
+ visualSurfaces.push({
544
+ ref,
545
+ kind: el.tagName.toLowerCase(),
546
+ bbox: [
547
+ Math.round(rect.x),
548
+ Math.round(rect.y),
549
+ Math.round(rect.width),
550
+ Math.round(rect.height),
551
+ ],
552
+ })
553
+ }
554
+ const out = {
555
+ mode,
556
+ url: window.location.href,
557
+ title: document.title,
558
+ text,
559
+ elements,
560
+ viewport,
561
+ }
562
+ if (visualSurfaces.length > 0) out.visualSurfaces = visualSurfaces
563
+ return out
215
564
  },
565
+ args: [mode],
216
566
  })
217
567
  if (!result || typeof result.result !== "object") {
218
568
  throw new Error("browser_read_page: scripting.executeScript returned nothing")
@@ -232,36 +582,93 @@ async function toolClick(args) {
232
582
  const clickCount = typeof args.clickCount === "number" ? args.clickCount : 1
233
583
  if (!tabId) throw new Error("browser_click: tabId is required")
234
584
  if (!ref && !selector) throw new Error("browser_click: ref or selector is required")
235
- const before = await chrome.tabs.get(tabId)
236
- const urlBefore = before.url
237
- const [result] = await chrome.scripting.executeScript({
238
- target: { tabId },
239
- func: (ref, selector, button, clickCount) => {
240
- const sel = ref ? `[data-gh-router-ref="${ref}"]` : selector
241
- const el = document.querySelector(sel)
242
- if (!el) return { ok: false, error: `element not found: ${sel}` }
243
- // Use native .click() for left-button (handles default action,
244
- // form submission, etc); MouseEvent for right-click context menus.
245
- if (button === "right") {
246
- for (let i = 0; i < clickCount; i++) {
247
- el.dispatchEvent(new MouseEvent("contextmenu", { bubbles: true, cancelable: true, button: 2 }))
585
+ // Subscribe to nav events BEFORE dispatching the click so a fast
586
+ // click nav transition can't race past us. Cleanup runs in
587
+ // finally so an executeScript throw doesn't leak listeners.
588
+ const navState = watchTabNavigation(tabId)
589
+ try {
590
+ const [result] = await chrome.scripting.executeScript({
591
+ target: { tabId },
592
+ func: (ref, selector, button, clickCount) => {
593
+ const sel = ref ? `[data-gh-router-ref="${ref}"]` : selector
594
+ const el = document.querySelector(sel)
595
+ if (!el) return { ok: false, error: `element not found: ${sel}` }
596
+ // Use native .click() for left-button (handles default action,
597
+ // form submission, etc); MouseEvent for right-click context menus.
598
+ if (button === "right") {
599
+ for (let i = 0; i < clickCount; i++) {
600
+ el.dispatchEvent(new MouseEvent("contextmenu", { bubbles: true, cancelable: true, button: 2 }))
601
+ }
602
+ } else {
603
+ for (let i = 0; i < clickCount; i++) el.click()
248
604
  }
249
- } else {
250
- for (let i = 0; i < clickCount; i++) el.click()
251
- }
252
- return { ok: true }
253
- },
254
- args: [ref, selector, button, clickCount],
255
- })
256
- if (!result || !result.result || !result.result.ok) {
257
- throw new Error(`browser_click: ${result?.result?.error ?? "execution failed"}`)
605
+ return { ok: true }
606
+ },
607
+ args: [ref, selector, button, clickCount],
608
+ })
609
+ if (!result || !result.result || !result.result.ok) {
610
+ throw new Error(`browser_click: ${result?.result?.error ?? "execution failed"}`)
611
+ }
612
+ // Accurate navigated detection via webNavigation events (replaces the
613
+ // old 300ms URL-poll which missed slow nav and reported navigated:false
614
+ // for clicks that DID navigate but took longer to commit). Wait up to
615
+ // ~150ms for onBeforeNavigate to fire; if it does, then wait up to
616
+ // ~5s for onCommitted to land. If onBeforeNavigate never fires, no
617
+ // navigation was triggered — resolve false once the ~150ms window ends.
618
+ const navigated = await navState.promise
619
+ return { ok: true, navigated }
620
+ } finally {
621
+ navState.cleanup()
622
+ }
623
+ }
624
+
625
+ /**
626
+ * Pre-subscribe to chrome.webNavigation events for an upcoming click
627
+ * on a tab. Returns a {promise, cleanup} pair. The caller fires the
628
+ * click AFTER calling this so the listener can never miss the
629
+ * onBeforeNavigate that the click triggers.
630
+ *
631
+ * Promise resolves to:
632
+ * - true when onCommitted fires for tabId+frameId 0, or when ~5s pass
633
+ * after onBeforeNavigate without a commit (a nav DID start), OR
634
+ * - false when onBeforeNavigate doesn't fire within ~150ms post-call
635
+ * (= no nav triggered by the click).
636
+ *
637
+ * cleanup() removes both listeners — caller MUST invoke from a finally
638
+ * block to avoid leaking event subscriptions on errors.
639
+ */
640
+ function watchTabNavigation(tabId) {
641
+ const NO_NAV_MS = 150
642
+ const COMMIT_MS = 5000
643
+ let onBefore
644
+ let onCommitted
645
+ let resolved = false
646
+ let noNavTimer
647
+ let commitTimer
648
+ const cleanup = () => {
649
+ try { if (onBefore) chrome.webNavigation.onBeforeNavigate.removeListener(onBefore) } catch { /* ignore */ }
650
+ try { if (onCommitted) chrome.webNavigation.onCommitted.removeListener(onCommitted) } catch { /* ignore */ }
651
+ if (noNavTimer) clearTimeout(noNavTimer)
652
+ if (commitTimer) clearTimeout(commitTimer)
258
653
  }
259
- // Brief settle window so clicks that trigger navigation surface in
260
- // the response. 300ms is enough to catch immediate-redirect clicks
261
- // without significantly slowing the tool's tail latency.
262
- await sleep(300)
263
- const after = await chrome.tabs.get(tabId)
264
- return { ok: true, navigated: after.url !== urlBefore }
654
+ const promise = new Promise((resolve) => {
655
+ const settle = (v) => { if (!resolved) { resolved = true; resolve(v) } }
656
+ onCommitted = (details) => {
657
+ if (details.tabId === tabId && details.frameId === 0) settle(true)
658
+ }
659
+ onBefore = (details) => {
660
+ if (details.tabId !== tabId || details.frameId !== 0) return
661
+ // Nav started; switch from "did we get a nav at all" to "wait
662
+ // for commit". If commit doesn't land in COMMIT_MS, assume the
663
+ // nav stuck or was cancelled and report true (a nav DID start).
664
+ if (noNavTimer) { clearTimeout(noNavTimer); noNavTimer = undefined }
665
+ commitTimer = setTimeout(() => settle(true), COMMIT_MS)
666
+ }
667
+ chrome.webNavigation.onBeforeNavigate.addListener(onBefore)
668
+ chrome.webNavigation.onCommitted.addListener(onCommitted)
669
+ noNavTimer = setTimeout(() => settle(false), NO_NAV_MS)
670
+ })
671
+ return { promise, cleanup }
265
672
  }
266
673
 
267
674
  async function toolFill(args) {
@@ -1289,9 +1696,29 @@ async function attachDebuggerOnce(tabId, opts) {
1289
1696
  networkBuffers.set(tabId, [])
1290
1697
  await chrome.debugger.sendCommand({ tabId }, "Network.enable")
1291
1698
  }
1699
+ // CDP a11y-tree extraction needs DOM + Page + Accessibility
1700
+ // domains enabled. We track them in a single Set so a second
1701
+ // captureSnapshot call on the same tab is a no-op.
1702
+ if (opts?.accessibility && !axDomainsEnabledTabs.has(tabId)) {
1703
+ axDomainsEnabledTabs.add(tabId)
1704
+ try {
1705
+ await chrome.debugger.sendCommand({ tabId }, "DOM.enable")
1706
+ await chrome.debugger.sendCommand({ tabId }, "Page.enable")
1707
+ await chrome.debugger.sendCommand({ tabId }, "Accessibility.enable")
1708
+ } catch (err) {
1709
+ // Roll back the tracking flag so the next call retries.
1710
+ axDomainsEnabledTabs.delete(tabId)
1711
+ throw err
1712
+ }
1713
+ }
1292
1714
  })
1293
1715
  }
1294
1716
 
1717
+ // Track which tabs have the CDP a11y domains enabled (DOM + Page +
1718
+ // Accessibility). Cleared on debugger.onDetach / tabs.onRemoved
1719
+ // alongside the other per-tab state.
1720
+ const axDomainsEnabledTabs = new Set()
1721
+
1295
1722
  chrome.debugger.onEvent.addListener((source, method, params) => {
1296
1723
  const tabId = source.tabId
1297
1724
  if (typeof tabId !== "number") return
@@ -1323,6 +1750,12 @@ chrome.debugger.onDetach.addListener((source) => {
1323
1750
  consoleBuffers.delete(source.tabId)
1324
1751
  networkBuffers.delete(source.tabId)
1325
1752
  tabInputLockTails.delete(source.tabId)
1753
+ axDomainsEnabledTabs.delete(source.tabId)
1754
+ // Snapshot cache: CDP-written refs survive a detach (they're DOM
1755
+ // attributes, not CDP state), but bbox/AXNode IDs become unreliable
1756
+ // because re-attach needs a fresh DOM.enable handshake. Safer to
1757
+ // invalidate and re-capture on next read.
1758
+ invalidateSnapshot(source.tabId, "debugger-detach")
1326
1759
  }
1327
1760
  })
1328
1761
 
@@ -1337,6 +1770,22 @@ chrome.tabs.onRemoved.addListener((tabId) => {
1337
1770
  consoleBuffers.delete(tabId)
1338
1771
  networkBuffers.delete(tabId)
1339
1772
  tabInputLockTails.delete(tabId)
1773
+ axDomainsEnabledTabs.delete(tabId)
1774
+ invalidateSnapshot(tabId, "tab-closed")
1775
+ })
1776
+
1777
+ // Snapshot cache invalidation on top-frame navigation. The legacy ref
1778
+ // scheme (data-gh-router-ref DOM attribute) does NOT survive a fresh
1779
+ // document load, so a stale snapshot would return refs that resolve
1780
+ // to nothing. Invalidate so the next read captures the new document.
1781
+ // We intentionally do NOT invalidate on child-frame navigations — a
1782
+ // click inside an iframe shouldn't bust the whole-tab snapshot. Phase
1783
+ // 1b-CDP will revisit this when cross-origin iframe ref attribution
1784
+ // changes the trade-off.
1785
+ chrome.webNavigation.onCommitted.addListener((details) => {
1786
+ if (details.frameId === 0 && typeof details.tabId === "number") {
1787
+ invalidateSnapshot(details.tabId, "navigation")
1788
+ }
1340
1789
  })
1341
1790
 
1342
1791
  async function toolConsoleLogs(args) {
@@ -1362,6 +1811,128 @@ async function toolNetworkLog(args) {
1362
1811
  return { entries: drained }
1363
1812
  }
1364
1813
 
1814
+ // ---------------------------------------------------------------------
1815
+ // Bot-challenge detection (Phase 4 auto-detect)
1816
+ // ---------------------------------------------------------------------
1817
+ // Listens to chrome.webRequest.onHeadersReceived for response-header
1818
+ // fingerprints of major bot-protection vendors. On match: post a
1819
+ // `__botDetected__` control frame to the bridge. The bridge tracks
1820
+ // which tabs are flagged and the proxy dispatcher consults that state
1821
+ // via /health to inject humanlike pacing for paced tabs.
1822
+ //
1823
+ // Signature confidence tiers:
1824
+ // HIGH (per-vendor, single-hit enables): cf-ray + 403/503, x-dd-b,
1825
+ // x-px-block, x-px-uuid + 403/429, x-iinfo header on 403.
1826
+ // MEDIUM (cookie / generic — deferred to v2): _abck=*~-1~ cookie,
1827
+ // burst of 403/429 across 5 s window.
1828
+ //
1829
+ // False-positive guard: only fires when we actually own the
1830
+ // connection to the bridge (`nativePort` set). No phantom signals
1831
+ // during SW startup before the port opens.
1832
+
1833
+ const BOT_DETECTION_VENDORS = {
1834
+ cloudflare: (resp) => {
1835
+ if (resp.statusCode !== 403 && resp.statusCode !== 503) return null
1836
+ const cfRay = headerValue(resp.responseHeaders, "cf-ray")
1837
+ return cfRay ? { signal: "cf-ray + " + resp.statusCode, evidence: cfRay.slice(0, 60) } : null
1838
+ },
1839
+ datadome: (resp) => {
1840
+ const dd = headerValue(resp.responseHeaders, "x-dd-b")
1841
+ return dd === "1" ? { signal: "x-dd-b=1", evidence: "" } : null
1842
+ },
1843
+ perimeterx: (resp) => {
1844
+ if (headerValue(resp.responseHeaders, "x-px-block") === "1") {
1845
+ return { signal: "x-px-block=1", evidence: "" }
1846
+ }
1847
+ const pxUuid = headerValue(resp.responseHeaders, "x-px-uuid")
1848
+ if (pxUuid && (resp.statusCode === 403 || resp.statusCode === 429)) {
1849
+ return { signal: "x-px-uuid + " + resp.statusCode, evidence: pxUuid.slice(0, 36) }
1850
+ }
1851
+ return null
1852
+ },
1853
+ imperva: (resp) => {
1854
+ if (resp.statusCode !== 403) return null
1855
+ const iinfo = headerValue(resp.responseHeaders, "x-iinfo")
1856
+ return iinfo ? { signal: "x-iinfo + 403", evidence: iinfo.slice(0, 40) } : null
1857
+ },
1858
+ }
1859
+
1860
+ function headerValue(headers, name) {
1861
+ if (!Array.isArray(headers)) return undefined
1862
+ const lower = name.toLowerCase()
1863
+ for (const h of headers) {
1864
+ if (h.name && h.name.toLowerCase() === lower) return h.value
1865
+ }
1866
+ return undefined
1867
+ }
1868
+
1869
+ // Per-tab deduplication: a single vendor's signature firing repeatedly
1870
+ // on a tab should emit ONE control frame, not one per response. Bridge
1871
+ // already de-dupes by tabId on its side; we de-dupe here too to keep
1872
+ // the wire quiet.
1873
+ const detectedVendorsByTab = new Map() // tabId -> Set<vendor>
1874
+
1875
+ function emitBotDetected(tabId, vendor, signal, evidence) {
1876
+ if (typeof tabId !== "number" || tabId < 0) return
1877
+ if (!nativePort) return
1878
+ let seen = detectedVendorsByTab.get(tabId)
1879
+ if (!seen) {
1880
+ seen = new Set()
1881
+ detectedVendorsByTab.set(tabId, seen)
1882
+ }
1883
+ if (seen.has(vendor)) return
1884
+ seen.add(vendor)
1885
+ try {
1886
+ nativePort.postMessage({
1887
+ type: "__botDetected__",
1888
+ tabId,
1889
+ vendor,
1890
+ signal,
1891
+ evidence,
1892
+ ts: Date.now(),
1893
+ })
1894
+ } catch (err) {
1895
+ console.warn("[browser-bridge/bot-detect] post failed:", err)
1896
+ }
1897
+ }
1898
+
1899
+ // MAIN frame only — sub-resource 403s on tracking pixels are common
1900
+ // noise. Vendor blocks always land on the main document request.
1901
+ try {
1902
+ chrome.webRequest.onHeadersReceived.addListener(
1903
+ (details) => {
1904
+ try {
1905
+ for (const [vendor, probe] of Object.entries(BOT_DETECTION_VENDORS)) {
1906
+ const hit = probe(details)
1907
+ if (hit) {
1908
+ emitBotDetected(details.tabId, vendor, hit.signal, hit.evidence)
1909
+ break
1910
+ }
1911
+ }
1912
+ } catch (err) {
1913
+ console.warn("[browser-bridge/bot-detect] probe crashed:", err)
1914
+ }
1915
+ },
1916
+ { urls: ["<all_urls>"], types: ["main_frame"] },
1917
+ ["responseHeaders"],
1918
+ )
1919
+ } catch (err) {
1920
+ // webRequest permission may not be granted on some enterprise
1921
+ // policies; auto-detect just no-ops in that case.
1922
+ console.warn("[browser-bridge/bot-detect] webRequest listener registration failed:", err)
1923
+ }
1924
+
1925
+ // Cleanup: clear vendor dedup state on navigation + tab close so a
1926
+ // new document gets a fresh detection window.
1927
+ chrome.webNavigation.onCommitted.addListener((details) => {
1928
+ if (details.frameId === 0 && typeof details.tabId === "number") {
1929
+ detectedVendorsByTab.delete(details.tabId)
1930
+ }
1931
+ })
1932
+ chrome.tabs.onRemoved.addListener((tabId) => {
1933
+ detectedVendorsByTab.delete(tabId)
1934
+ })
1935
+
1365
1936
  const TOOL_HANDLERS = {
1366
1937
  __ping__: () => ({
1367
1938
  pong: true,
@@ -1438,11 +2009,39 @@ function connectBridge() {
1438
2009
  nativePort = undefined
1439
2010
  })
1440
2011
  nativePort = port
2012
+ // Hello frame — lets the bridge associate this connection with a
2013
+ // version. Pre-flight on the proxy side compares this against the
2014
+ // version stamped into dist/browser-ext/manifest.json at build, and
2015
+ // triggers an auto-reload (via __reload__ control frame) when the
2016
+ // package has been updated but the loaded extension is stale.
2017
+ try {
2018
+ port.postMessage({
2019
+ type: "__hello__",
2020
+ version: chrome.runtime.getManifest().version,
2021
+ })
2022
+ } catch (err) {
2023
+ console.warn("[browser-bridge] hello frame failed:", err)
2024
+ }
1441
2025
  return port
1442
2026
  }
1443
2027
 
1444
2028
  async function handleBridgeRequest(req, port) {
1445
- if (!req || typeof req.id !== "string" || typeof req.tool !== "string") return
2029
+ if (!req) return
2030
+ // Control frames — not regular tool dispatches. The bridge sends
2031
+ // these out-of-band; the {id, tool, args} shape doesn't apply.
2032
+ if (req.type === "__reload__") {
2033
+ // chrome.runtime.reload terminates this service worker and starts
2034
+ // a fresh one that re-reads on-disk files. Used by the proxy's
2035
+ // pre-flight when the loaded extension version doesn't match the
2036
+ // version stamped into dist/browser-ext/manifest.json.
2037
+ try {
2038
+ chrome.runtime.reload()
2039
+ } catch (err) {
2040
+ console.warn("[browser-bridge] reload failed:", err)
2041
+ }
2042
+ return
2043
+ }
2044
+ if (typeof req.id !== "string" || typeof req.tool !== "string") return
1446
2045
  const handler = TOOL_HANDLERS[req.tool]
1447
2046
  if (!handler) {
1448
2047
  port.postMessage({ id: req.id, ok: false, error: `unknown tool: ${req.tool}`, code: "unknown_tool" })
@@ -1451,11 +2050,43 @@ async function handleBridgeRequest(req, port) {
1451
2050
  try {
1452
2051
  const data = await handler(req.args || {})
1453
2052
  port.postMessage({ id: req.id, ok: true, data })
2053
+ // Snapshot cache invalidation for mutating actions. The matcher
2054
+ // cascade (Phase 2) dispatches against cached snapshots; a
2055
+ // successful click / fill / type / etc. likely changed the page,
2056
+ // so the cached element list is stale. Invalidate by tabId from
2057
+ // the request args; tools that don't carry a tabId (open_tab on
2058
+ // create-path, list_tabs) are not page-mutating per-tab so they
2059
+ // skip this.
2060
+ if (MUTATES_PAGE.has(req.tool)) {
2061
+ const tabId = typeof req.args?.tabId === "number" ? req.args.tabId : undefined
2062
+ if (typeof tabId === "number") {
2063
+ invalidateSnapshot(tabId, `mutation:${req.tool}`)
2064
+ }
2065
+ }
1454
2066
  } catch (err) {
1455
2067
  port.postMessage({ id: req.id, ok: false, error: err && err.message ? err.message : String(err) })
1456
2068
  }
1457
2069
  }
1458
2070
 
2071
// Names of tools treated as page-mutating. When one of these completes
// successfully, the request dispatcher invalidates the cached snapshot
// for the tabId carried in the request args.
//
// The list errs on the side of inclusion: a listed tool may turn out
// to change nothing (clicking a disabled button, say). A needless
// invalidate only costs one extra capture on the next read, while a
// missed one means dispatching against a ref that no longer exists.
// Registering another mutating tool is a single new entry here.
const MUTATES_PAGE = new Set([
  // element interaction
  "browser_click", "browser_fill", "browser_type", "browser_keyboard",
  // pointer / viewport
  "browser_scroll", "browser_mouse", "browser_drag",
  // navigation and arbitrary script
  "browser_navigate", "browser_eval_js",
])
2089
+
1459
2090
  chrome.runtime.onInstalled.addListener(() => {
1460
2091
  try { connectBridge() } catch (err) { console.warn("[browser-bridge] onInstalled connect failed:", err) }
1461
2092
  })
@@ -1483,17 +2114,17 @@ chrome.tabs.onUpdated.addListener(() => {
1483
2114
  try { connectBridge() } catch (err) { console.warn("[browser-bridge] onUpdated connect failed:", err) }
1484
2115
  })
1485
2116
 
1486
- // Defense in depth — webNavigation listener catches in-page-initiated
1487
- // navigations (JS-driven redirects, meta-refresh, anchor clicks the
1488
- // model didn't go through browser_navigate for). Tool-initiated paths
1489
- // already pre-check via isBlockedUrl() / the bridge-layer policy.ts,
1490
- // so this is the safety net for navigations the bridge can't see.
1491
- //
1492
- // On match: cancel the navigation by routing the tab back to
1493
- // about:blank, AND log a console.error so browser_console_logs can
1494
- // surface "the model tried to navigate to a blocked URL" on the next
1495
- // drain. The cancel happens via chrome.tabs.update — there's no
1496
- // onBeforeNavigate "cancel" API in MV3.
2117
+ // webNavigation.onBeforeNavigate fires for ALL top-level navigations —
2118
+ // user-typed URL bar entries AND in-page-initiated nav (JS redirect,
2119
+ // meta-refresh, anchor clicks). It does NOT expose transitionType, so
2120
+ // we can't cheaply distinguish initiator at this stage. Consequence:
2121
+ // every URL in BLOCKED_URL_RE is unreachable when this extension is
2122
+ // enabled, including for the human user. `extensions` is deliberately
2123
+ // excluded from BLOCKED_URL_RE to preserve user access to the page
2124
+ // they need to manage this extension; bridge-side policy.ts still
2125
+ // rejects tool-initiated nav there. On match: route the tab back to
2126
+ // about:blank (no onBeforeNavigate cancel API in MV3) and log a
2127
+ // console.error so browser_console_logs can surface it on next drain.
1497
2128
  chrome.webNavigation.onBeforeNavigate.addListener((details) => {
1498
2129
  if (details.frameId !== 0) return // only top-level frame
1499
2130
  if (isBlockedUrl(details.url)) {