github-router 0.3.41 → 0.3.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -152,6 +152,14 @@ async function toolScreenshot(args) {
152
152
  await sleep(150)
153
153
  }
154
154
  }
155
+ // Both this API and CDP Page.captureScreenshot require the browser
156
+ // to have a real OS-level rendering surface. On Chrome-for-Testing
157
+ // launched in plain headed mode without --headless=new, no such
158
+ // surface exists and either path hangs indefinitely — the Playwright
159
+ // E2E harness passes --headless=new in its args list for exactly this
160
+ // reason. Real Chrome with a visible window has a surface and works
161
+ // fine. If you're driving Chrome-for-Testing programmatically and
162
+ // need screenshots, launch with `--headless=new`.
155
163
  const dataUrl = await chrome.tabs.captureVisibleTab(windowId, { format })
156
164
  // dataUrl: "data:image/png;base64,...."
157
165
  const m = /^data:([^;]+);base64,(.*)$/.exec(dataUrl)
@@ -194,7 +202,16 @@ async function toolReadPage(args) {
194
202
  const MAX = 256 * 1024
195
203
  let text = document.body ? document.body.innerText : ""
196
204
  if (text.length > MAX) text = text.slice(0, MAX)
197
- return { text, elements }
205
+ // Viewport metadata so the model can correlate CSS-px bbox to
206
+ // device-px pixels in browser_screenshot (device_px = css_px * dpr).
207
+ const viewport = {
208
+ width: window.innerWidth,
209
+ height: window.innerHeight,
210
+ devicePixelRatio: window.devicePixelRatio,
211
+ scrollX: window.scrollX,
212
+ scrollY: window.scrollY,
213
+ }
214
+ return { text, elements, viewport }
198
215
  },
199
216
  })
200
217
  if (!result || typeof result.result !== "object") {
@@ -304,13 +321,52 @@ async function toolFill(args) {
304
321
  }
305
322
 
306
323
  async function toolScroll(args) {
307
- const tabId = typeof args.tabId === "number" ? args.tabId : undefined
324
+ const tabId = args.tabId
308
325
  const target = args.target
309
- const pixels = typeof args.pixels === "number" ? args.pixels : 0
326
+ assertTabId("browser_scroll", tabId)
327
+ const pixels = Number.isFinite(args.pixels) ? args.pixels : 0
310
328
  const ref = typeof args.ref === "string" ? args.ref : null
311
- if (!tabId) throw new Error("browser_scroll: tabId is required")
312
- if (!["top", "bottom", "pixels", "element"].includes(target)) {
313
- throw new Error(`browser_scroll: target must be top|bottom|pixels|element, got ${String(target)}`)
329
+ if (!["top", "bottom", "pixels", "element", "at-pointer"].includes(target)) {
330
+ throw new Error(`browser_scroll: target must be top|bottom|pixels|element|at-pointer, got ${String(target)}`)
331
+ }
332
+ if (target === "at-pointer") {
333
+ // Wheel scroll a sub-region at a pointer location. Necessary for
334
+ // chat windows / infinite-scroll lists / modal bodies that have
335
+ // their own scroll container and ignore window.scrollTo. The wheel
336
+ // event bubbles through the scroll-container ancestor at the
337
+ // pointer location, so positioning the cursor on the right region
338
+ // is what makes it scroll instead of the outer window.
339
+ const selector = typeof args.selector === "string" ? args.selector : null
340
+ const x = Number.isFinite(args.x) ? args.x : undefined
341
+ const y = Number.isFinite(args.y) ? args.y : undefined
342
+ assertSingleTarget("browser_scroll(at-pointer)", ref, selector, x, y)
343
+ const deltaX = clampNum(Number.isFinite(args.deltaX) ? args.deltaX : 0, -10_000, 10_000)
344
+ const deltaY = clampNum(Number.isFinite(args.deltaY) ? args.deltaY : 0, -10_000, 10_000)
345
+ if (deltaX === 0 && deltaY === 0) {
346
+ throw new Error("browser_scroll(at-pointer): at least one of deltaX / deltaY must be non-zero")
347
+ }
348
+ const force = args.force === true
349
+ const pos = await resolveMouseTarget(tabId, ref, selector, x, y)
350
+ if (pos.hitTest && !pos.hitTest.isTarget && !force) {
351
+ throw new Error(`target_obscured: topmost is ${pos.hitTest.topmost || pos.hitTest.note}`)
352
+ }
353
+ return await withTabInputLock(tabId, async () => {
354
+ await attachDebuggerOnce(tabId)
355
+ // Position the cursor first so the wheel event lands on the
356
+ // right scroll-container ancestor.
357
+ await chrome.debugger.sendCommand({ tabId }, "Input.dispatchMouseEvent", {
358
+ type: "mouseMoved",
359
+ x: pos.x, y: pos.y,
360
+ button: "none", buttons: 0, modifiers: 0, pointerType: "mouse",
361
+ })
362
+ await chrome.debugger.sendCommand({ tabId }, "Input.dispatchMouseEvent", {
363
+ type: "mouseWheel",
364
+ x: pos.x, y: pos.y,
365
+ deltaX, deltaY,
366
+ button: "none", buttons: 0, modifiers: 0, pointerType: "mouse",
367
+ })
368
+ return { ok: true, scrolled: { x: pos.x, y: pos.y, deltaX, deltaY } }
369
+ })
314
370
  }
315
371
  const [result] = await chrome.scripting.executeScript({
316
372
  target: { tabId },
@@ -354,22 +410,702 @@ async function toolKeyboard(args) {
354
410
  // attachment. The attach stays for the tab's lifetime — chrome's
355
411
  // "is being controlled" banner is the visible cost, accepted in
356
412
  // exchange for cross-tool composability.
357
- await attachDebuggerOnce(tabId)
358
- const winVK = key.length === 1 ? key.toUpperCase().charCodeAt(0) : 0
359
- await chrome.debugger.sendCommand({ tabId }, "Input.dispatchKeyEvent", {
360
- type: "keyDown",
361
- modifiers: bits,
362
- key,
363
- text: key.length === 1 ? key : undefined,
364
- windowsVirtualKeyCode: winVK,
413
+ return await withTabInputLock(tabId, async () => {
414
+ await attachDebuggerOnce(tabId)
415
+ const winVK = key.length === 1 ? key.toUpperCase().charCodeAt(0) : 0
416
+ await chrome.debugger.sendCommand({ tabId }, "Input.dispatchKeyEvent", {
417
+ type: "keyDown",
418
+ modifiers: bits,
419
+ key,
420
+ text: key.length === 1 ? key : undefined,
421
+ windowsVirtualKeyCode: winVK,
422
+ })
423
+ await chrome.debugger.sendCommand({ tabId }, "Input.dispatchKeyEvent", {
424
+ type: "keyUp",
425
+ modifiers: bits,
426
+ key,
427
+ windowsVirtualKeyCode: winVK,
428
+ })
429
+ return { ok: true }
365
430
  })
366
- await chrome.debugger.sendCommand({ tabId }, "Input.dispatchKeyEvent", {
367
- type: "keyUp",
368
- modifiers: bits,
369
- key,
370
- windowsVirtualKeyCode: winVK,
431
+ }
432
+
433
+ // ---------------------------------------------------------------------
434
+ // Humanlike input v2: browser_mouse, browser_drag, browser_type, browser_locate
435
+ // ---------------------------------------------------------------------
436
+ // All four are CDP-driven (Input.dispatchMouseEvent / Input.dispatchKeyEvent),
437
+ // share the per-tab input mutex (withTabInputLock), and inherit the
438
+ // hardened attachDebuggerOnce. resolveMouseTarget centralises ref /
439
+ // selector / (x,y) → bbox-center resolution AND the elementFromPoint
440
+ // hit-test so all three coordinate-driven tools refuse to act on
441
+ // occluded targets by default (force:true bypass).
442
+
443
+ const BUTTON_BITS = { left: 1, right: 2, middle: 4 }
444
+
445
/**
 * Coerce `v` to a number and clamp it into the closed range [min, max].
 *
 * @param {*} v - candidate value; non-numbers are converted with Number().
 * @param {number} min - lower bound, also the fallback for non-finite input.
 * @param {number} max - upper bound.
 * @returns {number} the clamped value, or `min` when `v` is NaN / ±Infinity
 *   after coercion.
 */
function clampNum(v, min, max) {
  let value = v
  if (typeof value !== "number") value = Number(value)
  if (Number.isFinite(value)) {
    const capped = Math.min(max, value)
    return Math.max(min, capped)
  }
  return min
}
450
+
451
+ // Per-tab input mutex. CDP mouse / keyboard state is global per
452
+ // attachment, so two parallel browser_mouse / browser_drag / browser_type
453
+ // calls on the same tab would interleave and corrupt each other (one
454
+ // call's mouseMoved would land mid-drag of another). The global
455
+ // MAX_INFLIGHT_TOOLS_CALL=8 cap doesn't help — it's global, not per-tab.
456
+ // This mutex is per-tab, layered on top.
457
+ const tabInputLockTails = new Map() // tabId → Promise (tail of the lock chain)
458
+
459
+ // Wall-clock cap on how long ONE input call may hold its tab's mutex,
460
+ // passed per-call (each tool sizes its own cap). Acts as a deadlock
461
+ // release valve when an in-extension hang outlives the dispatcher's
462
+ // WS-side timeout — without this cap the lock would stay held forever
463
+ // (CDP commands don't abort when the dispatcher's WS disconnects).
464
+ //
465
+ // On wedge: we force-detach `chrome.debugger` for the tab AND bump the
466
+ // tab's input generation. The detach makes all in-flight `sendCommand`
467
+ // promises in the wedged fn() reject with "Debugger is not attached"
468
+ // — without this, the wedged fn could keep dispatching stale CDP
469
+ // events (e.g. a leftover `mouseReleased`) after the next caller has
470
+ // already taken the lock and started a fresh drag, corrupting it.
471
+ // `attachedTabs` is cleared so the next caller's `attachDebuggerOnce`
472
+ // re-attaches cleanly. Cost: per-tab `consoleBuffers` /
473
+ // `networkBuffers` are dropped (their backing CDP domain is no longer
474
+ // enabled); the next `browser_console_logs` / `browser_network_log`
475
+ // call re-`Runtime.enable` / `Network.enable` and starts capturing
476
+ // fresh. A loud console.warn surfaces the wedge to forensic readers.
477
+ //
478
+ // Default cap = 60s — comfortably covers mouse/drag/scroll/keyboard
479
+ // dispatcher maxMs (30s/30s/15s/10s) plus CDP overhead. `browser_type`
480
+ // passes a larger explicit cap to accommodate its legitimately-slow
481
+ // per-keystroke max (210s + grace).
482
+ const DEFAULT_TAB_INPUT_LOCK_HOLD_CAP_MS = 60_000
483
+ const TYPE_TAB_INPUT_LOCK_HOLD_CAP_MS = 240_000
484
+
485
+ const tabInputGenerations = new Map() // tabId → number; bumped by withTabInputLock when a call wedges (no per-acquire bump is visible in this section)
486
+
487
/**
 * Run `fn` while holding the per-tab input mutex, with a wall-clock cap.
 *
 * Calls for the same tabId run strictly one after another (FIFO); calls
 * for different tabs are independent. If `fn` has not settled within
 * `holdCapMs`, the call rejects with `input_lock_wedged`, the debugger is
 * detached from the tab, the tab's attach / buffer bookkeeping is cleared,
 * its input generation is bumped, and the lock is released.
 *
 * @param {number} tabId - tab whose input lock to take.
 * @param {() => Promise<*>} fn - work to run while holding the lock.
 * @param {number} [holdCapMs] - maximum hold time in milliseconds.
 * @returns {Promise<*>} whatever `fn` resolves to.
 */
async function withTabInputLock(tabId, fn, holdCapMs = DEFAULT_TAB_INPUT_LOCK_HOLD_CAP_MS) {
  // Chain onto whoever currently holds (or is queued for) this tab's lock.
  // The new tail is published BEFORE we wait, so later callers queue
  // behind us rather than beside us.
  const priorTail = tabInputLockTails.get(tabId) || Promise.resolve()
  let unlock
  const held = new Promise((resolve) => { unlock = resolve })
  const ownTail = priorTail.then(() => held)
  tabInputLockTails.set(tabId, ownTail)
  await priorTail

  let capTimer
  let capExceeded = false
  // Rejects once the hold cap elapses; the flag tells the cleanup below
  // that this was a wedge rather than an ordinary failure of fn().
  const startCap = () => new Promise((_, reject) => {
    capTimer = setTimeout(() => {
      capExceeded = true
      reject(new Error(
        `input_lock_wedged: held > ${holdCapMs}ms on tabId=${tabId}; force-detached debugger to abort the stuck CDP call.`,
      ))
    }, holdCapMs)
  })

  try {
    const work = fn()
    return await Promise.race([work, startCap()])
  } finally {
    if (capTimer !== undefined) clearTimeout(capTimer)
    if (capExceeded) {
      console.warn(`[browser-bridge] tab ${tabId} input lock wedged past ${holdCapMs}ms — force-detaching debugger`)
      // Detaching makes the stuck call's pending sendCommand promises
      // reject, so its stale events cannot interleave with the next caller.
      try {
        await chrome.debugger.detach({ tabId })
      } catch {
        // already detached / tab gone — fine
      }
      attachedTabs.delete(tabId)
      // The buffers' CDP domains are gone with the attachment.
      consoleBuffers.delete(tabId)
      networkBuffers.delete(tabId)
      // Mark everything started under the old attachment as stale.
      const generation = tabInputGenerations.get(tabId) || 0
      tabInputGenerations.set(tabId, generation + 1)
    }
    unlock()
    // Drop the Map entry only when nobody queued behind us; otherwise the
    // stored tail belongs to a later caller and must stay.
    if (tabInputLockTails.get(tabId) === ownTail) {
      tabInputLockTails.delete(tabId)
    }
  }
}
538
+
539
/**
 * Send one CDP `Input.dispatchMouseEvent` to a tab.
 *
 * `modifiers` is always 0 and `pointerType` is always "mouse"; everything
 * else is passed through as given.
 *
 * @param {number} tabId - debuggee tab.
 * @param {string} type - CDP mouse event type (e.g. "mouseMoved").
 * @param {number} x - x coordinate passed to CDP.
 * @param {number} y - y coordinate passed to CDP.
 * @param {string} button - CDP button name ("none", "left", ...).
 * @param {number} buttons - bitmask of buttons held during the event.
 * @param {number} clickCount - click count reported to the page.
 * @returns {Promise<void>}
 */
async function dispatchMouseEvent(tabId, type, x, y, button, buttons, clickCount) {
  const params = {
    type,
    x,
    y,
    button,
    buttons,
    clickCount,
    modifiers: 0,
    pointerType: "mouse",
  }
  await chrome.debugger.sendCommand({ tabId }, "Input.dispatchMouseEvent", params)
}
544
+
545
// Resolve ref / selector / (x,y) → { x, y, draggable, hitTest }.
//
// - Coordinate mode (finite x AND y): returns the rounded point with
//   draggable:false and hitTest:null — there is no target identity to
//   hit-test against.
// - Element mode (ref or selector): runs a script in the tab that finds
//   the element, takes the centre of its bounding rect, and hit-tests that
//   point with elementFromPoint. hitTest carries { isTarget, topmost } (or
//   { isTarget:false, note } when the centre is outside the viewport) so
//   the caller can decide whether the target is actually clickable or is
//   occluded by an overlay (default behavior: refuse with target_obscured
//   unless force:true).
//
// Exclusivity (exactly ONE of ref / selector / (x,y)) is checked by the
// caller — see assertSingleTarget.
//
// Throws when no target descriptor is given, when the element is not
// found, or when executeScript returns no result.
async function resolveMouseTarget(tabId, ref, selector, x, y) {
  if (Number.isFinite(x) && Number.isFinite(y)) {
    // Coordinate mode: no target identity, no hit-test (we don't know
    // which element the caller expects to hit).
    return { x: Math.round(x), y: Math.round(y), draggable: false, hitTest: null }
  }
  if (!ref && !selector) {
    throw new Error("target required: provide ref, selector, or both x and y")
  }
  const [result] = await chrome.scripting.executeScript({
    target: { tabId },
    // Runs inside the page: must stay self-contained (no closure access).
    func: (ref, selector) => {
      const sel = ref
        ? `[data-gh-router-ref="${typeof CSS !== "undefined" && CSS.escape ? CSS.escape(ref) : ref.replace(/["\\]/g, "\\$&")}"]`
        : selector
      const el = document.querySelector(sel)
      if (!el) return { error: `element not found: ${sel}` }
      const rect = el.getBoundingClientRect()
      const cx = Math.round(rect.x + rect.width / 2)
      const cy = Math.round(rect.y + rect.height / 2)
      // Viewport coordinates are half-open: [0, innerWidth) × [0, innerHeight).
      // A centre sitting exactly on the right / bottom edge is outside, and
      // elementFromPoint would return null there — report it as
      // off-viewport rather than as "obscured by (none)".
      const inView = cx >= 0 && cy >= 0 && cx < window.innerWidth && cy < window.innerHeight
      const draggable = el.draggable === true
      if (!inView) {
        return {
          x: cx, y: cy, draggable,
          hitTest: { isTarget: false, note: "target center off-viewport" },
        }
      }
      const top = document.elementFromPoint(cx, cy)
      // isTarget: only accept when topmost IS the element, or topmost
      // is a DESCENDANT of the element (clicking the child bubbles to
      // the target). Do NOT accept top.contains(el) — that would be
      // true whenever the topmost falls through to a parent (e.g. when
      // el has pointer-events:none, or is fully covered by a sibling
      // and elementFromPoint walks up to the container). That's
      // exactly the "obscured" case we want to flag.
      const isTarget = !!top && (top === el || el.contains(top))
      // Short tag#id.class1.class2 label for the error message.
      let topmost = "(none)"
      if (top) {
        const id = top.id ? "#" + top.id : ""
        const cls = top.className && typeof top.className === "string" ? "." + top.className.split(/\s+/).filter(Boolean).slice(0, 2).join(".") : ""
        topmost = `${top.tagName.toLowerCase()}${id}${cls}`
      }
      return {
        x: cx, y: cy, draggable,
        hitTest: { isTarget, topmost },
      }
    },
    args: [ref || null, selector || null],
  })
  if (!result || !result.result) {
    throw new Error("target resolution failed: scripting.executeScript returned nothing")
  }
  if (result.result.error) throw new Error(result.result.error)
  return result.result
}
612
+
613
/**
 * Enforce that exactly one target descriptor was supplied: a ref, a
 * selector, or an (x, y) pair. Nothing is picked silently — ambiguous or
 * missing input is an error.
 *
 * @param {string} prefix - tool / arg-group name used to prefix the message.
 * @param {?string} ref - element ref; any falsy value counts as absent.
 * @param {?string} selector - CSS selector; any falsy value counts as absent.
 * @param {number|undefined} x - x coordinate; `undefined` means absent.
 * @param {number|undefined} y - y coordinate; `undefined` means absent.
 * @throws {Error} when only one of x / y is given, when no descriptor is
 *   given, or when more than one is given.
 */
function assertSingleTarget(prefix, ref, selector, x, y) {
  const xGiven = x !== undefined
  const yGiven = y !== undefined
  if (xGiven !== yGiven) {
    throw new Error(`${prefix}: x and y must be provided together`)
  }
  // Past the check above, x and y are either both given or both absent.
  const descriptors = [Boolean(ref), Boolean(selector), xGiven && yGiven]
  const provided = descriptors.filter(Boolean).length
  if (provided === 0) {
    throw new Error(`${prefix}: provide one of ref, selector, or (x, y)`)
  }
  if (provided > 1) {
    throw new Error(`${prefix}: pass exactly one of ref, selector, or (x, y) — not multiple`)
  }
}
634
+
635
/**
 * Reject anything that is not a non-negative integer tab id.
 *
 * @param {string} prefix - tool name used to prefix the error message.
 * @param {*} tabId - value to validate.
 * @throws {Error} when `tabId` is not an integer or is negative.
 */
function assertTabId(prefix, tabId) {
  const valid = Number.isInteger(tabId) && tabId >= 0
  if (valid) return
  throw new Error(`${prefix}: tabId must be a non-negative integer`)
}
640
+
641
/**
 * browser_mouse: move / click / dblclick / down / up at a resolved target.
 *
 * Arguments read from `args`:
 *   tabId        non-negative integer (required)
 *   action       "move" | "click" | "dblclick" | "down" | "up" (required)
 *   button       "left" (default) | "right" | "middle"
 *   steps        approach-path length, clamped to 1..100 (default 1)
 *   stepDelayMs  pause between approach moves, clamped to 0..50 (default 8)
 *   force        true to act even when the hit-test says the target is obscured
 *   ref | selector | (x, y)   exactly one target descriptor
 *
 * Returns { ok: true } (plus `position` for "move"). Throws on invalid
 * arguments or `target_obscured`.
 */
async function toolMouse(args) {
  const tabId = args.tabId
  const action = args.action
  assertTabId("browser_mouse", tabId)
  if (!["move", "click", "dblclick", "down", "up"].includes(action)) {
    throw new Error(`browser_mouse: action must be move|click|dblclick|down|up, got ${String(action)}`)
  }
  const buttonRaw = typeof args.button === "string" ? args.button : "left"
  if (!["left", "right", "middle"].includes(buttonRaw)) {
    throw new Error(`browser_mouse: button must be left|right|middle, got ${buttonRaw}`)
  }
  const button = buttonRaw
  const buttonBits = BUTTON_BITS[button]
  const steps = Math.round(clampNum(args.steps ?? 1, 1, 100))
  const stepDelayMs = Math.round(clampNum(args.stepDelayMs ?? 8, 0, 50))
  const force = args.force === true
  const ref = typeof args.ref === "string" ? args.ref : null
  const selector = typeof args.selector === "string" ? args.selector : null
  const x = Number.isFinite(args.x) ? args.x : undefined
  const y = Number.isFinite(args.y) ? args.y : undefined
  assertSingleTarget("browser_mouse", ref, selector, x, y)

  const target = await resolveMouseTarget(tabId, ref, selector, x, y)
  if (target.hitTest && !target.hitTest.isTarget && !force) {
    throw new Error(
      `target_obscured: topmost is ${target.hitTest.topmost || target.hitTest.note}. Pass force:true to bypass.`,
    )
  }

  return await withTabInputLock(tabId, async () => {
    await attachDebuggerOnce(tabId)
    // "up" is the release half of a down → up sequence, so the button is
    // still held while the pointer travels to the release point. The
    // approach moves must therefore carry the held button: a pointermove
    // with buttons:0 in the middle of a press makes pointer-event drag
    // handlers abort their tracking. Every other action approaches with
    // no button held.
    const releasing = action === "up"
    const moveButton = releasing ? button : "none"
    const moveButtons = releasing ? buttonBits : 0
    // Interpolated approach: synthesise an origin point a bit away from
    // the target and walk N steps in. We don't track a real cursor
    // position across calls (MV3 SW dormancy would silently wipe it);
    // the synthetic approach still fires the expected mouseMoved
    // sequence for libraries that need a trajectory.
    const path = steps > 1 ? interpolateApproach(target.x, target.y, steps) : [{ x: target.x, y: target.y }]
    for (let i = 0; i < path.length; i++) {
      await dispatchMouseEvent(tabId, "mouseMoved", path[i].x, path[i].y, moveButton, moveButtons, 1)
      if (i < path.length - 1 && stepDelayMs > 0) await sleep(stepDelayMs)
    }
    if (action === "move") {
      return { ok: true, position: { x: target.x, y: target.y } }
    }
    if (action === "down") {
      await dispatchMouseEvent(tabId, "mousePressed", target.x, target.y, button, buttonBits, 1)
      return { ok: true }
    }
    if (action === "up") {
      await dispatchMouseEvent(tabId, "mouseReleased", target.x, target.y, button, 0, 1)
      return { ok: true }
    }
    if (action === "click") {
      await dispatchMouseEvent(tabId, "mousePressed", target.x, target.y, button, buttonBits, 1)
      await dispatchMouseEvent(tabId, "mouseReleased", target.x, target.y, button, 0, 1)
      return { ok: true }
    }
    // dblclick: two press/release cycles with incrementing clickCount.
    // A single press/release with clickCount:2 is NOT a real double-click;
    // browsers expect two single clicks in quick succession with the
    // clickCount on the second one bumped to 2, which is what fires the
    // `dblclick` event.
    await dispatchMouseEvent(tabId, "mousePressed", target.x, target.y, button, buttonBits, 1)
    await dispatchMouseEvent(tabId, "mouseReleased", target.x, target.y, button, 0, 1)
    await dispatchMouseEvent(tabId, "mousePressed", target.x, target.y, button, buttonBits, 2)
    await dispatchMouseEvent(tabId, "mouseReleased", target.x, target.y, button, 0, 2)
    return { ok: true }
  })
}
710
+
711
/**
 * Build a straight-line approach path that ends on the target.
 *
 * The path starts from a synthetic origin 50px left of and 20px above the
 * target, with each origin coordinate floored at 0 so no approach point is
 * generated from a negative origin near the viewport edge. The origin
 * itself is not part of the result.
 *
 * @param {number} targetX - x coordinate of the final point.
 * @param {number} targetY - y coordinate of the final point.
 * @param {number} steps - number of points to generate.
 * @returns {{x: number, y: number}[]} `steps` rounded points; the last one
 *   is the (rounded) target.
 */
function interpolateApproach(targetX, targetY, steps) {
  const startX = Math.max(0, targetX - 50)
  const startY = Math.max(0, targetY - 20)
  const spanX = targetX - startX
  const spanY = targetY - startY
  const points = []
  for (let step = 1; step <= steps; step += 1) {
    const fraction = step / steps
    const px = Math.round(startX + spanX * fraction)
    const py = Math.round(startY + spanY * fraction)
    points.push({ x: px, y: py })
  }
  return points
}
729
+
730
/**
 * browser_drag: press at a source, move, and release at a destination.
 *
 * Arguments read from `args`:
 *   tabId        non-negative integer (required)
 *   button       "left" (default) | "middle"
 *   steps        clamped to 1..100 (default 15)
 *   stepDelayMs  clamped to 0..50 (default 12)
 *   force        true to drag even when the source hit-test fails
 *   mode         "auto" (default) | "pointer" | "html5"; "auto" picks
 *                "html5" when the resolved source is draggable, else "pointer"
 *   fromRef | fromSelector | (fromX, fromY)   exactly one source descriptor
 *   toRef | toSelector | (toX, toY)           exactly one destination descriptor
 *
 * Only the SOURCE hit-test is enforced. Returns
 * { ok, mode_used, from: {x, y}, to: {x, y} }.
 */
async function toolDrag(args) {
  const { tabId } = args
  assertTabId("browser_drag", tabId)

  const asString = (value) => (typeof value === "string" ? value : null)
  const asCoord = (value) => (Number.isFinite(value) ? value : undefined)

  const button = typeof args.button === "string" ? args.button : "left"
  if (button !== "left" && button !== "middle") {
    throw new Error(`browser_drag: button must be left|middle, got ${button}`)
  }
  const buttonBits = BUTTON_BITS[button]
  const steps = Math.round(clampNum(args.steps ?? 15, 1, 100))
  const stepDelayMs = Math.round(clampNum(args.stepDelayMs ?? 12, 0, 50))
  const force = args.force === true
  const requestedMode = typeof args.mode === "string" ? args.mode : "auto"
  if (requestedMode !== "auto" && requestedMode !== "pointer" && requestedMode !== "html5") {
    throw new Error(`browser_drag: mode must be auto|pointer|html5, got ${requestedMode}`)
  }

  // Parse and validate both endpoints before touching the page.
  const fromRef = asString(args.fromRef)
  const fromSelector = asString(args.fromSelector)
  const fromX = asCoord(args.fromX)
  const fromY = asCoord(args.fromY)
  assertSingleTarget("browser_drag.from", fromRef, fromSelector, fromX, fromY)
  const toRef = asString(args.toRef)
  const toSelector = asString(args.toSelector)
  const toX = asCoord(args.toX)
  const toY = asCoord(args.toY)
  assertSingleTarget("browser_drag.to", toRef, toSelector, toX, toY)

  const from = await resolveMouseTarget(tabId, fromRef, fromSelector, fromX, fromY)
  const to = await resolveMouseTarget(tabId, toRef, toSelector, toX, toY)
  const sourceBlocked = from.hitTest && !from.hitTest.isTarget
  if (sourceBlocked && !force) {
    throw new Error(
      `target_obscured: drag source topmost is ${from.hitTest.topmost || from.hitTest.note}. Pass force:true to bypass.`,
    )
  }

  let mode = requestedMode
  if (mode === "auto") {
    mode = from.draggable ? "html5" : "pointer"
  }

  return await withTabInputLock(tabId, async () => {
    await attachDebuggerOnce(tabId)
    const runDrag = mode === "html5" ? dragHtml5 : dragPointer
    await runDrag(tabId, from, to, button, buttonBits, steps, stepDelayMs)
    return {
      ok: true,
      mode_used: mode,
      from: { x: from.x, y: from.y },
      to: { x: to.x, y: to.y },
    }
  })
}
777
+
778
/**
 * Pointer-event drag: move to `from`, press, walk `steps` interpolated
 * moves to `to` with the button held, then release at `to`.
 *
 * This is the path for mouse / pointer-event driven drag handlers
 * (react-dnd, Sortable.js and similar). Every intermediate move carries
 * `buttons: buttonBits`; without it handlers see pointermove with
 * buttons:0 and abort drag tracking.
 *
 * If anything throws after the press succeeded, the button is still
 * released (best effort) before the error propagates, so a failed drag
 * does not leave the tab with a stuck button.
 *
 * @param {number} tabId - debuggee tab.
 * @param {{x: number, y: number}} from - press point.
 * @param {{x: number, y: number}} to - release point.
 * @param {string} button - CDP button name.
 * @param {number} buttonBits - bitmask for `button`.
 * @param {number} steps - number of intermediate moves.
 * @param {number} stepDelayMs - pause between moves (skipped after the last).
 */
async function dragPointer(tabId, from, to, button, buttonBits, steps, stepDelayMs) {
  // Move with the button held. Sent directly (not via dispatchMouseEvent)
  // so that no clickCount is attached to the move.
  const moveWhileHeld = (x, y) => chrome.debugger.sendCommand({ tabId }, "Input.dispatchMouseEvent", {
    type: "mouseMoved", x, y, button, buttons: buttonBits, modifiers: 0, pointerType: "mouse",
  })
  const releaseAtDestination = () => dispatchMouseEvent(tabId, "mouseReleased", to.x, to.y, button, 0, 1)

  await dispatchMouseEvent(tabId, "mouseMoved", from.x, from.y, "none", 0, 1)
  let buttonDown = false
  try {
    await dispatchMouseEvent(tabId, "mousePressed", from.x, from.y, button, buttonBits, 1)
    buttonDown = true
    let step = 1
    while (step <= steps) {
      const t = step / steps
      const px = Math.round(from.x + (to.x - from.x) * t)
      const py = Math.round(from.y + (to.y - from.y) * t)
      await moveWhileHeld(px, py)
      if (step < steps && stepDelayMs > 0) await sleep(stepDelayMs)
      step += 1
    }
    await releaseAtDestination()
    buttonDown = false
  } finally {
    if (buttonDown) {
      try {
        await releaseAtDestination()
      } catch {
        // Swallow — don't mask the original error. A second failure here
        // means the tab is in worse trouble than a stuck button.
      }
    }
  }
}
817
+
818
/**
 * HTML5 native drag-and-drop (draggable="true" sources) via CDP drag
 * interception.
 *
 * Sequence: enable Input.setInterceptDrags, press at `from`, make up to
 * five held moves covering the first 30% of the way to `to` so the
 * browser's drag-detect fires, wait up to 1s for `Input.dragIntercepted`
 * to deliver the DragData, then dispatch dragEnter / dragOver / drop at
 * `to` and release the button.
 *
 * Cleanup always runs: the event listener is removed, a still-held button
 * is released (best effort), and drag interception is switched off (best
 * effort).
 *
 * @param {number} tabId - debuggee tab.
 * @param {{x: number, y: number}} from - press point.
 * @param {{x: number, y: number}} to - drop point.
 * @param {string} button - CDP button name.
 * @param {number} buttonBits - bitmask for `button`.
 * @param {number} steps - caps the number of drag-detect moves (max 5).
 * @param {number} stepDelayMs - pause after each drag-detect move.
 * @throws {Error} `drag_failed` when no dragIntercepted event arrives
 *   within the deadline — never reports success for a drag that did not
 *   start.
 */
async function dragHtml5(tabId, from, to, button, buttonBits, steps, stepDelayMs) {
  const send = (method, params) => chrome.debugger.sendCommand({ tabId }, method, params)

  // Captures the DragData payload for this tab only.
  let dragData = null
  const onDebuggerEvent = (source, method, params) => {
    const relevant = source.tabId === tabId && method === "Input.dragIntercepted"
    if (relevant && params && params.data) {
      dragData = params.data
    }
  }
  chrome.debugger.onEvent.addListener(onDebuggerEvent)

  let buttonDown = false
  try {
    await send("Input.setInterceptDrags", { enabled: true })
    await dispatchMouseEvent(tabId, "mouseMoved", from.x, from.y, "none", 0, 1)
    await dispatchMouseEvent(tabId, "mousePressed", from.x, from.y, button, buttonBits, 1)
    buttonDown = true

    // A handful of held moves to trigger the drag-detect heuristic
    // (dragstart fires after a few px of movement with the button held).
    const detectMoves = Math.min(5, steps)
    let move = 1
    while (move <= detectMoves) {
      const t = (move / detectMoves) * 0.3 // partial progress toward dest
      const px = Math.round(from.x + (to.x - from.x) * t)
      const py = Math.round(from.y + (to.y - from.y) * t)
      await send("Input.dispatchMouseEvent", {
        type: "mouseMoved", x: px, y: py, button, buttons: buttonBits, modifiers: 0, pointerType: "mouse",
      })
      if (stepDelayMs > 0) await sleep(stepDelayMs)
      move += 1
    }

    // Poll for the intercepted DragData (up to 1s); dispatchDragEvent
    // cannot be sent without it.
    const giveUpAt = Date.now() + 1_000
    while (!dragData && Date.now() < giveUpAt) {
      await sleep(20)
    }
    if (!dragData) {
      // Source isn't html5-draggable, the page prevented dragstart, or
      // drag-detect never fired. Fail loudly so the caller can retry with
      // mode:"pointer" instead of reasoning from a phantom state.
      throw new Error("drag_failed: Input.dragIntercepted never arrived within 1s — source may not be html5-draggable or dragstart was prevented. Retry with mode:\"pointer\".")
    }

    for (const phase of ["dragEnter", "dragOver", "drop"]) {
      await send("Input.dispatchDragEvent", { type: phase, x: to.x, y: to.y, data: dragData })
    }
    await dispatchMouseEvent(tabId, "mouseReleased", to.x, to.y, button, 0, 1)
    buttonDown = false
  } finally {
    chrome.debugger.onEvent.removeListener(onDebuggerEvent)
    if (buttonDown) {
      try {
        await dispatchMouseEvent(tabId, "mouseReleased", to.x, to.y, button, 0, 1)
      } catch {
        // Swallow — don't mask the original error.
      }
    }
    try {
      await send("Input.setInterceptDrags", { enabled: false })
    } catch {
      // Ignore — turning intercept off on a fresh attach is harmless.
    }
  }
}
898
+
899
/**
 * browser_type: type a string into the focused element, one CDP key
 * event pair (keyDown + keyUp) per character.
 *
 * Arguments read from `args`:
 *   tabId    non-negative integer (required)
 *   text     string, at most 4096 UTF-16 units before normalisation (required)
 *   delayMs  pause after each character, clamped to 0..50 (default 0)
 *
 * "\r\n" and lone "\r" are normalised to "\n". "\n", "\t" and "\b" are
 * sent as the named keys Enter / Tab / Backspace; every other control
 * character below U+0020 is rejected with `invalid_text` before anything
 * is dispatched.
 *
 * Returns { ok: true, chars } where `chars` is the number of characters
 * dispatched.
 */
async function toolType(args) {
  const tabId = args.tabId
  assertTabId("browser_type", tabId)
  const textRaw = args.text
  if (typeof textRaw !== "string") throw new Error("browser_type: text (string) is required")
  if (textRaw.length > 4096) {
    throw new Error("browser_type: text exceeds 4096-character limit")
  }
  // Normalize CRLF / lone CR to LF so Windows-origin clipboard text and
  // HTTP-response text don't trip the control-char rejection below. The
  // user's intent for "\r\n" is unambiguously a newline.
  const text = textRaw.replace(/\r\n?/g, "\n")
  const delayMs = Math.round(clampNum(args.delayMs ?? 0, 0, 50))
  // Validate up front: control chars outside the whitelist have no key
  // mapping and would produce junk events, so reject before dispatching
  // anything and let the model route them through browser_keyboard.
  for (const ch of text) {
    const code = ch.codePointAt(0)
    if (code < 0x20 && code !== 0x0A && code !== 0x09 && code !== 0x08) {
      const hex = code.toString(16).toUpperCase().padStart(4, "0")
      throw new Error(
        `invalid_text: control char U+${hex} not supported. browser_type whitelist: \\n=Enter, \\t=Tab, \\b=Backspace, \\r normalized to \\n. Use browser_keyboard for other control sequences.`,
      )
    }
  }
  return await withTabInputLock(tabId, async () => {
    await attachDebuggerOnce(tabId)
    let count = 0
    for (const ch of text) {
      const code = ch.codePointAt(0)
      let key, codeStr, vkc, sendText
      if (code === 0x0A) {
        // Enter must carry text "\r": the browser derives the keypress
        // from the keyDown's text, and it is that keypress which inserts
        // the newline / triggers implicit form submission. Without text
        // the page only sees keydown + keyup and nothing is typed.
        // Puppeteer's US keyboard layout sends "\r" for Enter likewise.
        key = "Enter"; codeStr = "Enter"; vkc = 13; sendText = "\r"
      } else if (code === 0x09) {
        // Tab and Backspace are sent without text (keydown only).
        key = "Tab"; codeStr = "Tab"; vkc = 9; sendText = undefined
      } else if (code === 0x08) {
        key = "Backspace"; codeStr = "Backspace"; vkc = 8; sendText = undefined
      } else {
        key = ch
        // Punctuation table fills in real Windows-VK values for the
        // characters whose naive `charCodeAt` would collide with
        // unrelated VK codes (e.g. '.' = 46 = VK_DELETE). Letters and
        // digits use their natural charCode (VK_A..VK_Z / VK_0..VK_9
        // happen to match). Everything else: 0, and CDP infers event
        // semantics from `key` + `text`.
        const punctVk = PUNCT_TO_VK[ch]
        if (/^[a-zA-Z0-9]$/.test(ch)) {
          vkc = ch.toUpperCase().charCodeAt(0)
        } else if (punctVk !== undefined) {
          vkc = punctVk
        } else {
          vkc = 0
        }
        codeStr = deriveKeyCode(ch)
        sendText = ch
      }
      // Correct CDP recipe: keyDown WITH text fires keydown + keypress +
      // input together. Do NOT also send a separate `char` event — that
      // would double-fire keypress/input on most sites.
      const downParams = {
        type: "keyDown",
        key,
        code: codeStr || undefined,
        modifiers: 0,
        windowsVirtualKeyCode: vkc,
      }
      if (sendText !== undefined) downParams.text = sendText
      await chrome.debugger.sendCommand({ tabId }, "Input.dispatchKeyEvent", downParams)
      await chrome.debugger.sendCommand({ tabId }, "Input.dispatchKeyEvent", {
        type: "keyUp",
        key,
        code: codeStr || undefined,
        modifiers: 0,
        windowsVirtualKeyCode: vkc,
      })
      count++
      if (delayMs > 0) await sleep(delayMs)
    }
    return { ok: true, chars: count }
  }, TYPE_TAB_INPUT_LOCK_HOLD_CAP_MS)
}
986
+
987
// Windows virtual-key (VK) codes for the printable punctuation that
// browser_type sends, assuming a US-QWERTY layout. Letters and digits
// are intentionally absent: the typing loop derives their VK straight
// from the upper-cased character code.
//
// Each row below is one physical key: its VK code followed by every
// character that key produces (unshifted first, then shifted). Both
// characters share the key's VK because browser_type never dispatches a
// separate Shift keydown — the shifted glyph travels in the event's
// `text` field instead.
//
// Source: Windows VK reference (learn.microsoft.com/...windows-keyboard-codes);
// the 186..222 range is the layout-specific VK_OEM_* family.
const PUNCT_TO_VK = Object.freeze(
  Object.fromEntries(
    [
      // Shifted digit row — shares the VK of the digit key underneath.
      [49, "!"], [50, "@"], [51, "#"], [52, "$"], [53, "%"],
      [54, "^"], [55, "&"], [56, "*"], [57, "("], [48, ")"],
      // VK_OEM_* punctuation keys.
      [186, ";:"],
      [187, "=+"],
      [188, ",<"],
      [189, "-_"],
      [190, ".>"],
      [191, "/?"],
      [192, "`~"],
      [219, "[{"],
      [220, "\\|"],
      [221, "]}"],
      [222, "'\""],
      // VK_SPACE
      [32, " "],
    ].flatMap(([vk, chars]) => [...chars].map((ch) => [ch, vk])),
  ),
)
1015
+
1016
// Character → KeyboardEvent.code lookup for US-QWERTY punctuation.
// Each row is one physical key: its `code` name followed by the
// characters it produces (unshifted, then shifted). Built once at module
// load rather than on every typed character, and stored in a Map so a
// lookup can never resolve to an inherited Object.prototype member.
const DERIVE_KEY_CODE_PUNCT = new Map(
  [
    // Number-row shift partners
    ["Digit1", "!"], ["Digit2", "@"], ["Digit3", "#"], ["Digit4", "$"], ["Digit5", "%"],
    ["Digit6", "^"], ["Digit7", "&"], ["Digit8", "*"], ["Digit9", "("], ["Digit0", ")"],
    // OEM keys (US-QWERTY)
    ["Minus", "-_"],
    ["Equal", "=+"],
    ["BracketLeft", "[{"],
    ["BracketRight", "]}"],
    ["Backslash", "\\|"],
    ["Semicolon", ";:"],
    ["Quote", "'\""],
    ["Comma", ",<"],
    ["Period", ".>"],
    ["Slash", "/?"],
    ["Backquote", "`~"],
  ].flatMap(([code, chars]) => [...chars].map((c) => [c, code])),
)

// Best-effort `code` field for a single typed character.
//
// Returns the name of the PHYSICAL key the character lives on, so that
// shift-modified punctuation reports the same code as its unshifted
// partner (`!` → "Digit1", `<` → "Comma", ...). Sites that compare
// `event.code === "Digit1"` for layout-aware shortcuts then behave the
// same whether `1` or `!` was typed.
//
//   ch       a one-character string
//   returns  "Key<X>" for ASCII letters, "Digit<n>" for ASCII digits,
//            "Space" for a space, the table entry for US-QWERTY
//            punctuation, and "" for anything else (non-ASCII, the
//            empty string, multi-character input).
function deriveKeyCode(ch) {
  if (/^[a-zA-Z]$/.test(ch)) return "Key" + ch.toUpperCase()
  if (/^[0-9]$/.test(ch)) return "Digit" + ch
  if (ch === " ") return "Space"
  // Map.get, not obj[ch]: a plain-object lookup would hand back
  // inherited members for inputs such as "constructor" or "__proto__"
  // instead of falling through to the documented "" result.
  return DERIVE_KEY_CODE_PUNCT.get(ch) ?? ""
}
1045
+
1046
// browser_locate: report where an element sits in the tab's viewport and
// whether a click at its center would actually land on it.
//
// args.tabId     target tab (validated by assertTabId)
// args.ref       value of a `data-gh-router-ref` attribute, OR
// args.selector  a CSS selector — exactly one of ref / selector must be
//                a non-empty string.
//
// Resolves to `{ found: false, viewport }` when nothing matches, else to
// `{ found: true, bbox, center, inView, visible, pointerEvents,
// topmostAtCenter, viewport }`. bbox is [x, y, width, height] and center
// is [x, y], both rounded and taken from getBoundingClientRect (i.e.
// relative to the viewport). `viewport` carries innerWidth/innerHeight,
// devicePixelRatio and the scroll offsets.
//
// Throws when the ref/selector arguments are missing or both supplied,
// and when the injected script yields no usable result object.
async function toolLocate(args) {
  const tabId = args.tabId
  const ref = typeof args.ref === "string" ? args.ref : null
  const selector = typeof args.selector === "string" ? args.selector : null
  assertTabId("browser_locate", tabId)
  if (!ref && !selector) throw new Error("browser_locate: ref or selector is required")
  if (ref && selector) throw new Error("browser_locate: pass exactly one of ref or selector, not both")
  const [result] = await chrome.scripting.executeScript({
    target: { tabId },
    // Runs inside the page: may only use its own arguments and page globals.
    func: (ref, selector) => {
      const viewport = {
        width: window.innerWidth,
        height: window.innerHeight,
        devicePixelRatio: window.devicePixelRatio,
        scrollX: window.scrollX,
        scrollY: window.scrollY,
      }
      // A ref is turned into an attribute selector; escape it with
      // CSS.escape when available, otherwise backslash-escape quotes
      // and backslashes so it cannot break out of the quoted value.
      const sel = ref
        ? `[data-gh-router-ref="${typeof CSS !== "undefined" && CSS.escape ? CSS.escape(ref) : ref.replace(/["\\]/g, "\\$&")}"]`
        : selector
      const el = document.querySelector(sel)
      if (!el) return { found: false, viewport }
      const rect = el.getBoundingClientRect()
      const cx = Math.round(rect.x + rect.width / 2)
      const cy = Math.round(rect.y + rect.height / 2)
      const style = getComputedStyle(el)
      // "visible" = has a non-empty box and is not hidden via
      // display / visibility / opacity on the element itself.
      const visible =
        rect.width > 0 && rect.height > 0
        && style.display !== "none"
        && style.visibility !== "hidden"
        && parseFloat(style.opacity || "1") > 0
      // Only the center point is tested against the viewport bounds.
      const inView = cx >= 0 && cy >= 0 && cx <= window.innerWidth && cy <= window.innerHeight
      let topmostAtCenter = null
      if (inView) {
        const top = document.elementFromPoint(cx, cy)
        // Same hit-test rule as resolveMouseTarget: target IS topmost
        // or contains it as a descendant. Ancestor-containment (top
        // contains el) is FALSE here because that's the obscured case.
        const isTarget = !!top && (top === el || el.contains(top))
        const topRef = top && top.getAttribute ? top.getAttribute("data-gh-router-ref") : null
        topmostAtCenter = {
          isTarget,
          tag: top ? top.tagName.toLowerCase() : null,
          refOrSelector: topRef || null,
        }
      }
      return {
        found: true,
        bbox: [Math.round(rect.x), Math.round(rect.y), Math.round(rect.width), Math.round(rect.height)],
        center: [cx, cy],
        inView,
        visible,
        pointerEvents: style.pointerEvents,
        topmostAtCenter,
        viewport,
      }
    },
    args: [ref || null, selector || null],
  })
  // `typeof null === "object"`, so null must be rejected explicitly;
  // otherwise a null injection result would be returned to the caller as
  // if it were a successful locate payload.
  if (!result || result.result === null || typeof result.result !== "object") {
    throw new Error("browser_locate: scripting.executeScript returned nothing")
  }
  return result.result
}
374
1110
 
375
1111
  async function toolWait(args) {
@@ -504,18 +1240,56 @@ const attachedTabs = new Set()
504
1240
  const MAX_BUFFER_ENTRIES = 1000
505
1241
 
506
1242
  async function attachDebuggerOnce(tabId, opts) {
507
- if (!attachedTabs.has(tabId)) {
508
- try { await chrome.debugger.attach({ tabId }, "1.3") } catch { /* may already be attached */ }
509
- attachedTabs.add(tabId)
510
- }
511
- if (opts?.console && !consoleBuffers.has(tabId)) {
512
- consoleBuffers.set(tabId, [])
513
- await chrome.debugger.sendCommand({ tabId }, "Runtime.enable")
514
- }
515
- if (opts?.network && !networkBuffers.has(tabId)) {
516
- networkBuffers.set(tabId, [])
517
- await chrome.debugger.sendCommand({ tabId }, "Network.enable")
518
- }
1243
+ // navigator.locks serializes concurrent attach attempts after MV3 SW
1244
+ // respawn (when the in-memory attachedTabs Set is wiped but Chrome may
1245
+ // have kept the underlying CDP attachment alive past the SW death).
1246
+ // Without this lock, two parallel tool calls would both call
1247
+ // chrome.debugger.attach and the loser would throw
1248
+ // "Another debugger is already attached to this target".
1249
+ //
1250
+ // The "already attached" branch is subtle: "already attached" can mean
1251
+ // (a) WE attached and the SW just lost the cache, OR (b) DevTools /
1252
+ // another extension owns the session and we DON'T. Don't blindly trust
1253
+ // (a) that would poison the cache and every subsequent sendCommand
1254
+ // would fail with cryptic CDP errors. Prove ownership with a no-op
1255
+ // Runtime.evaluate; only cache on success.
1256
+ await navigator.locks.request(`browser-mcp:debugger-attach:${tabId}`, async () => {
1257
+ if (!attachedTabs.has(tabId)) {
1258
+ let mustVerifyOwnership = false
1259
+ try {
1260
+ await chrome.debugger.attach({ tabId }, "1.3")
1261
+ } catch (err) {
1262
+ const msg = err && err.message ? err.message : String(err)
1263
+ const alreadyAttached = /already attached/i.test(msg) || /already debugging/i.test(msg)
1264
+ if (!alreadyAttached) throw err
1265
+ // "Already attached" — could be us (Chrome kept the attachment
1266
+ // past our SW death) or another debugger (DevTools open, etc.).
1267
+ // Don't cache yet; verify below.
1268
+ mustVerifyOwnership = true
1269
+ }
1270
+ if (mustVerifyOwnership) {
1271
+ try {
1272
+ await chrome.debugger.sendCommand({ tabId }, "Runtime.evaluate", {
1273
+ expression: "1", returnByValue: true,
1274
+ })
1275
+ // sendCommand succeeded → we own the attachment. Safe to cache.
1276
+ } catch {
1277
+ throw new Error(
1278
+ "browser-mcp: chrome.debugger reports attached but we do not own the session — likely DevTools is open on this tab (or another extension is debugging). Close DevTools and retry.",
1279
+ )
1280
+ }
1281
+ }
1282
+ attachedTabs.add(tabId)
1283
+ }
1284
+ if (opts?.console && !consoleBuffers.has(tabId)) {
1285
+ consoleBuffers.set(tabId, [])
1286
+ await chrome.debugger.sendCommand({ tabId }, "Runtime.enable")
1287
+ }
1288
+ if (opts?.network && !networkBuffers.has(tabId)) {
1289
+ networkBuffers.set(tabId, [])
1290
+ await chrome.debugger.sendCommand({ tabId }, "Network.enable")
1291
+ }
1292
+ })
519
1293
  }
520
1294
 
521
1295
  chrome.debugger.onEvent.addListener((source, method, params) => {
@@ -548,9 +1322,23 @@ chrome.debugger.onDetach.addListener((source) => {
548
1322
  attachedTabs.delete(source.tabId)
549
1323
  consoleBuffers.delete(source.tabId)
550
1324
  networkBuffers.delete(source.tabId)
1325
+ tabInputLockTails.delete(source.tabId)
551
1326
  }
552
1327
  })
553
1328
 
1329
// Drop every piece of per-tab bookkeeping as soon as a tab is closed.
// The debugger.onDetach handler clears the same collections (Chrome
// detaches on tab close), so for attachedTabs / consoleBuffers /
// networkBuffers this is a cheap second line of defence against
// listener-ordering surprises. For tabInputLockTails it is the
// authoritative cleanup: per the original note, the lock-chain Map can
// leak when a drag was still in flight at the moment the tab closed.
chrome.tabs.onRemoved.addListener((closedTabId) => {
  const perTabState = [attachedTabs, consoleBuffers, networkBuffers, tabInputLockTails]
  for (const collection of perTabState) {
    collection.delete(closedTabId)
  }
})
1341
+
554
1342
  async function toolConsoleLogs(args) {
555
1343
  const tabId = typeof args.tabId === "number" ? args.tabId : undefined
556
1344
  const level = typeof args.level === "string" ? args.level : "all"
@@ -594,6 +1382,10 @@ const TOOL_HANDLERS = {
594
1382
  browser_download: toolDownload,
595
1383
  browser_console_logs: toolConsoleLogs,
596
1384
  browser_network_log: toolNetworkLog,
1385
+ browser_mouse: toolMouse,
1386
+ browser_drag: toolDrag,
1387
+ browser_type: toolType,
1388
+ browser_locate: toolLocate,
597
1389
  }
598
1390
 
599
1391
  // ---------------------------------------------------------------------