github-router 0.3.52 → 0.3.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/main.js CHANGED
@@ -45,6 +45,8 @@ const state = {
45
45
  showToken: false,
46
46
  extendedBetas: false,
47
47
  browseEnabled: false,
48
+ powerBrowseEnabled: false,
49
+ humanlikeForce: "auto",
48
50
  sessionId: randomUUID(),
49
51
  machineId: randomBytes(32).toString("hex")
50
52
  };
@@ -3076,6 +3078,58 @@ function installRequiredToolResult(payload) {
3076
3078
  };
3077
3079
  }
3078
3080
 
3081
+ //#endregion
3082
+ //#region src/lib/browser-mcp/humanlike.ts
3083
/**
 * Draw a delay, in whole milliseconds, from a Beta(2, 5) distribution
 * stretched over [minMs, maxMs].
 *
 * Beta(2, 5) peaks at 0.2 of the range: most delays land near the low
 * end with an occasional long pause, unlike the even spacing a uniform
 * draw would give.
 *
 * The Beta variate is built from two Gamma draws: for X ~ Gamma(2) and
 * Y ~ Gamma(5), X / (X + Y) ~ Beta(2, 5).
 *
 * @param {number} minMs - lower bound of the delay range
 * @param {number} maxMs - upper bound of the delay range
 * @returns {number} rounded delay in milliseconds
 */
function betaDelay(minMs, maxMs) {
  const gammaAlpha = gammaSample(2);
  const gammaBeta = gammaSample(5);
  const fraction = gammaAlpha / (gammaAlpha + gammaBeta);
  const span = maxMs - minMs;
  return Math.round(minMs + fraction * span);
}
3098
/**
 * Draw one sample from a Gamma(shape, 1) distribution.
 *
 * Uses the Marsaglia–Tsang squeeze method, which is valid for
 * shape >= 1. For 0 < shape < 1 the standard "boost" identity is
 * applied: Gamma(a) = Gamma(a + 1) * U^(1/a) with U uniform on (0, 1).
 *
 * @param {number} shape - shape parameter; must be finite and > 0
 * @returns {number} a non-negative sample
 * @throws {RangeError} when shape is not a finite positive number
 *   (previously such input produced NaN)
 */
function gammaSample(shape) {
  if (!(Number.isFinite(shape) && shape > 0)) {
    throw new RangeError(`gammaSample: shape must be a finite positive number, got ${shape}`);
  }
  if (shape < 1) {
    // Boost: sample at shape + 1, then scale down by U^(1/shape).
    let boost = 0;
    while (boost === 0) boost = Math.random();
    return gammaSample(shape + 1) * Math.pow(boost, 1 / shape);
  }
  const d = shape - 1 / 3;
  const c = 1 / Math.sqrt(9 * d);
  while (true) {
    let x;
    let v;
    // Redraw until the cubed term below is guaranteed positive.
    do {
      x = normalSample();
      v = 1 + c * x;
    } while (v <= 0);
    v = v * v * v;
    const u = Math.random();
    // Cheap squeeze test first; it accepts the vast majority of draws.
    if (u < 1 - .0331 * x * x * x * x) return d * v;
    // Exact acceptance test.
    if (Math.log(u) < .5 * x * x + d * (1 - v + Math.log(v))) return d * v;
  }
}
3113
/**
 * Draw one standard-normal sample with the Box–Muller transform.
 *
 * Both uniforms are redrawn while they are exactly 0: the first so
 * that Math.log never sees 0, the second to mirror it.
 *
 * @returns {number} a sample from N(0, 1)
 */
function normalSample() {
  const drawNonZero = () => {
    let sample = Math.random();
    while (sample === 0) sample = Math.random();
    return sample;
  };
  const radiusSource = drawNonZero();
  const angleSource = drawNonZero();
  return Math.sqrt(-2 * Math.log(radiusSource)) * Math.cos(2 * Math.PI * angleSource);
}
3119
/**
 * Pick the target gap between two paced browser actions.
 *
 * Delegates to `betaDelay(800, 4600)`, so the result is a whole number
 * of milliseconds between 800 and 4600, skewed toward the low end with
 * an occasional long pause. The 800 ms floor keeps paced actions from
 * ever firing "too fast".
 *
 * The value is a target gap, not a sleep duration: the caller should
 * subtract time already spent since the previous action so the
 * perceived delay is not doubled.
 *
 * @returns {number} target gap in milliseconds
 */
function interActionDelay() {
  const shortestGapMs = 800;
  const longestGapMs = 4600;
  return betaDelay(shortestGapMs, longestGapMs);
}
3132
+
3079
3133
  //#endregion
3080
3134
  //#region src/lib/browser-mcp/policy.ts
3081
3135
  const BLOCKED_URL_RE = /^(chrome|edge|brave|opera|vivaldi):\/\/(settings|preferences|extensions|policy|management|password|flags|flag-descriptions)/i;
@@ -3110,6 +3164,78 @@ function preflightUrlPolicy(toolName, args) {
3110
3164
 
3111
3165
  //#endregion
3112
3166
  //#region src/lib/browser-mcp/dispatch.ts
3167
/**
 * Names of the tools that count as a mutating user action for pacing.
 * Only these tools are subject to the inter-action delay; anything not
 * listed here (read-only tools such as tab listing, screenshots or
 * page reads) is dispatched without pacing.
 */
const PACED_TOOLS = new Set([
  "browser_click",
  "browser_fill",
  "browser_type",
  "browser_keyboard",
  "browser_scroll",
  "browser_mouse",
  "browser_drag",
]);
3182
/** Epoch-ms timestamp of the most recent paced dispatch (0 = none yet). */
let lastDispatchAt = 0;

/**
 * Result of the last successful bridge probe: the tab ids reported as
 * humanlike, and when (epoch ms) the probe was started.
 */
let humanlikeAutoCache = {
  tabs: /* @__PURE__ */ new Set(),
  fetchedAt: 0,
};

/** Minimum age of the cache, in ms, before the bridge is probed again. */
const HUMANLIKE_PROBE_INTERVAL_MS = 5 * 1000;
3188
/**
 * Decide whether "auto" pacing applies to a tab.
 *
 * Returns false immediately when pacing is forced off or no numeric
 * tab id is given. Otherwise the bridge's `/health` endpoint is probed
 * at most once per HUMANLIKE_PROBE_INTERVAL_MS and the set of tab ids
 * it reports under `humanlike_tabs` is cached.
 *
 * The probe is best effort: if it fails (bridge down, aborted request,
 * non-OK status, unparsable body) the previous cache is kept and
 * consulted. Malformed entries in `humanlike_tabs` are skipped
 * individually instead of discarding the whole refresh.
 *
 * @param {number|undefined} tabId - tab the action targets
 * @param {AbortSignal} [signal] - forwarded to the probe request
 * @returns {Promise<boolean>} true when the cached set contains tabId
 */
async function isHumanlikeAutoOn(tabId, signal) {
  if (state.humanlikeForce === "off") return false;
  if (typeof tabId !== "number") return false;
  const now = Date.now();
  if (now - humanlikeAutoCache.fetchedAt > HUMANLIKE_PROBE_INTERVAL_MS) {
    try {
      const ready = await ensureBridgeReady();
      if (ready.install_required) return false;
      const res = await fetch(`http://127.0.0.1:${ready.port}/health`, {
        headers: { authorization: `Bearer ${ready.token}` },
        signal
      });
      if (res.ok) {
        const body = await res.json();
        // Tolerate a missing / non-array list and null or odd entries.
        const reported = Array.isArray(body?.humanlike_tabs) ? body.humanlike_tabs : [];
        const tabs = /* @__PURE__ */ new Set();
        for (const entry of reported) {
          if (typeof entry?.tabId === "number") tabs.add(entry.tabId);
        }
        humanlikeAutoCache = {
          fetchedAt: now,
          tabs
        };
      }
    } catch {
      // Deliberately ignored: pacing detection must never fail a
      // dispatch. Fall through to whatever the cache last held.
    }
  }
  return humanlikeAutoCache.tabs.has(tabId);
}
3211
/**
 * Pause before a paced tool call so consecutive actions are spaced
 * like a human's.
 *
 * Does nothing for tools outside PACED_TOOLS, or when pacing is not
 * active (forced "on", or "auto" with the tab reported as humanlike).
 * When active, a target gap is drawn and only the part of it not
 * already elapsed since the previous paced dispatch is slept.
 *
 * @param {string} tool - tool name being dispatched
 * @param {AbortSignal} [signal] - aborts the wait (and the auto probe)
 * @param {number} [tabId] - tab the action targets, for "auto" mode
 * @returns {Promise<void>}
 */
async function maybeInjectHumanlikeDelay(tool, signal, tabId) {
  if (!PACED_TOOLS.has(tool)) return;
  const mode = state.humanlikeForce;
  let paced = false;
  if (mode === "on") paced = true;
  else if (mode === "auto") paced = await isHumanlikeAutoOn(tabId, signal);
  if (!paced) return;
  const targetGapMs = interActionDelay();
  const elapsedMs = Date.now() - lastDispatchAt;
  const remainingMs = Math.max(0, targetGapMs - elapsedMs);
  if (remainingMs > 0) await sleepAbortable(remainingMs, signal);
  lastDispatchAt = Date.now();
}
3222
/**
 * Sleep for `ms` milliseconds, ending early if `signal` aborts.
 *
 * @param {number} ms - how long to wait
 * @param {AbortSignal} [signal] - optional cancellation signal
 * @returns {Promise<void>} resolves after the wait; rejects with
 *   `Error("aborted")` if the signal is already aborted or aborts
 *   during the wait
 */
function sleepAbortable(ms, signal) {
  return new Promise((resolve, reject) => {
    if (signal?.aborted) {
      reject(/* @__PURE__ */ new Error("aborted"));
      return;
    }
    let timer;
    const handleAbort = () => {
      clearTimeout(timer);
      reject(/* @__PURE__ */ new Error("aborted"));
    };
    const handleTimeout = () => {
      // Detach the listener so a later abort cannot touch a settled promise.
      if (signal) signal.removeEventListener("abort", handleAbort);
      resolve();
    };
    timer = setTimeout(handleTimeout, ms);
    if (signal) signal.addEventListener("abort", handleAbort, { once: true });
  });
}
3113
3239
  const PER_TOOL_TIMEOUTS = {
3114
3240
  browser_list_tabs: {
3115
3241
  defaultMs: 5e3,
@@ -3275,6 +3401,7 @@ async function dispatchBrowserTool(tool, args, signal, opts = {}) {
3275
3401
  };
3276
3402
  const ready = await ensureBridgeReady();
3277
3403
  if (ready.install_required) return installRequiredToolResult(ready);
3404
+ await maybeInjectHumanlikeDelay(tool, signal, typeof args.tabId === "number" ? args.tabId : void 0);
3278
3405
  const { defaultMs, maxMs } = pickTimeout(tool);
3279
3406
  const callerTimeout = typeof opts.timeoutMs === "number" && opts.timeoutMs > 0 ? Math.min(opts.timeoutMs, maxMs) : defaultMs;
3280
3407
  try {
@@ -3355,6 +3482,517 @@ function logAudit$1(record) {
3355
3482
  })();
3356
3483
  }
3357
3484
 
3485
+ //#endregion
3486
+ //#region src/lib/browser-mcp/matcher.ts
3487
/**
 * Resolve a parsed intent to a single element + action by running the
 * matcher layers in order. Synchronous; performs no I/O itself.
 *
 * A layer wins when, after tie-breaking, its best candidate reaches
 * the layer's score floor and leads the runner-up by at least 0.15.
 * If no layer wins, the result has `source: "escalate"`, an empty
 * `ref`, and a `candidates` shortlist (at most 8, deduplicated by ref)
 * that the caller can hand to a slower fallback.
 *
 * @param {object} snapshot - page snapshot; passed through to each layer
 * @param {object} parsed - parsed intent (see parseIntent)
 * @param {string} [value] - explicit value; overrides the intent's own
 * @returns {object} resolved action or escalation record
 */
function deterministicResolve(snapshot, parsed, value) {
  const effectiveValue = value ?? parsed.valueFromIntent;
  const seen = [];
  for (const layer of LAYERS) {
    const hits = layer.run(snapshot, parsed, effectiveValue);
    if (hits.length === 0) continue;
    seen.push(...hits);
    const [best, second] = applyTieBreakers(hits, parsed);
    if (!best) continue;
    const clearsFloor = best.score >= layer.floor;
    const clearsRunnerUp = !second || best.score - second.score >= .15;
    if (!(clearsFloor && clearsRunnerUp)) continue;
    const action = inferActionLocal(best.el.role, parsed, effectiveValue);
    const resolved = { ref: best.el.ref, action };
    if (needsValue(action) && effectiveValue !== void 0) resolved.value = effectiveValue;
    resolved.confidence = best.score;
    resolved.source = layer.name;
    resolved.reason = best.reason;
    return resolved;
  }
  const shortlist = dedupeAndRank(seen).slice(0, 8);
  const escalation = { ref: "", action: parsed.verb ?? "click" };
  if (effectiveValue !== void 0) escalation.value = effectiveValue;
  escalation.confidence = 0;
  escalation.source = "escalate";
  escalation.reason = shortlist.length === 0 ? "no candidates from any cascade layer" : `${shortlist.length} ambiguous candidates`;
  escalation.candidates = shortlist.map(({ el, score, layer }) => ({
    ref: el.ref,
    score,
    layer
  }));
  return escalation;
}
3534
/**
 * Filter out unusable candidates, reweight the rest and sort them
 * best-first.
 *
 * Dropped: hidden elements, elements whose bbox width or height
 * (bbox[2] / bbox[3]) is under 4, and, for click/fill/type/select,
 * disabled elements. Surviving scores are multiplied by `weight`.
 *
 * @param {Array<object>} cands - candidates from one layer
 * @param {object} parsed - parsed intent; `verb` defaults to "click"
 * @returns {Array<object>} new candidate objects, highest score first
 */
function applyTieBreakers(cands, parsed) {
  const verb = parsed.verb ?? "click";
  const needsEnabled = ["click", "fill", "type", "select"].includes(verb);
  const survivors = [];
  for (const cand of cands) {
    const { hidden, bbox, disabled } = cand.el;
    if (hidden) continue;
    if (bbox && (bbox[2] < 4 || bbox[3] < 4)) continue;
    if (needsEnabled && disabled) continue;
    survivors.push({
      ...cand,
      score: cand.score * weight(cand, verb)
    });
  }
  return survivors.sort((left, right) => right.score - left.score);
}
3547
/**
 * Compute the tie-break multiplier (at most 1) for a candidate.
 *
 * Penalties: 0.92 when the bbox origin is not at non-negative
 * coordinates, 0.95 for elements inside an iframe and, for clicks
 * only, a role-based factor (link/a 0.98, menuitem 0.96,
 * generic/div/span 0.9; buttons and other roles are not penalized).
 *
 * @param {object} c - candidate with an `el` element record
 * @param {string} verb - effective verb of the intent
 * @returns {number} multiplier in (0, 1]
 */
function weight(c, verb) {
  const { bbox, isInIframe, role } = c.el;
  let factor = 1;
  if (bbox && (!(bbox[0] >= 0) || !(bbox[1] >= 0))) factor *= .92;
  if (isInIframe) factor *= .95;
  if (verb === "click") {
    switch ((role || "").toLowerCase()) {
      case "link":
      case "a":
        factor *= .98;
        break;
      case "menuitem":
        factor *= .96;
        break;
      case "generic":
      case "div":
      case "span":
        factor *= .9;
        break;
      default:
        break;
    }
  }
  return Math.min(1, factor);
}
3563
/**
 * Collapse candidates that point at the same element ref, keeping the
 * highest-scoring one (the earliest wins on a tie), and return the
 * survivors sorted by score, best first.
 *
 * @param {Array<object>} cands - candidates gathered across layers
 * @returns {Array<object>} one candidate per ref, highest score first
 */
function dedupeAndRank(cands) {
  const bestByRef = cands.reduce((acc, cand) => {
    const prior = acc.get(cand.el.ref);
    if (!prior || prior.score < cand.score) acc.set(cand.el.ref, cand);
    return acc;
  }, /* @__PURE__ */ new Map());
  return Array.from(bestByRef.values()).sort((left, right) => right.score - left.score);
}
3571
/**
 * Choose the concrete action for a matched element.
 *
 * Order of precedence:
 *  1. an explicit scroll verb → "scroll_into_view";
 *  2. no verb at all but the word "scroll" in the raw target →
 *     "scroll_into_view" (the heuristic is skipped when a verb was
 *     parsed, so "click the Scroll to top button" stays a click);
 *  3. select/combobox roles → "select";
 *  4. text-entry roles → the explicit type/fill verb, else "fill"
 *     when a value is available, else "click";
 *  5. otherwise the parsed verb, defaulting to "click".
 *
 * @param {string|undefined} role - role of the matched element
 * @param {object} parsed - parsed intent (`verb`, `rawTarget`)
 * @param {string} [value] - value to enter, if any
 * @returns {string} action name
 */
function inferActionLocal(role, parsed, value) {
  if (parsed.verb === "scroll_into_view") return "scroll_into_view";
  // Only guess from the target text when the user gave no verb.
  if (parsed.verb === void 0 && /\bscroll\b/i.test(parsed.rawTarget ?? "")) return "scroll_into_view";
  const r = (role || "").toLowerCase();
  if (r === "select" || r === "combobox") return "select";
  if (r === "textarea" || r === "input" || r === "textbox" || r === "searchbox" || r === "spinbutton") {
    if (parsed.verb === "type") return "type";
    if (parsed.verb === "fill") return "fill";
    return value !== void 0 ? "fill" : "click";
  }
  return parsed.verb ?? "click";
}
3584
/**
 * Tell whether an action carries a value payload.
 *
 * @param {string} action - action name
 * @returns {boolean} true for "fill", "type" and "select"
 */
function needsValue(action) {
  return ["fill", "type", "select"].includes(action);
}
3587
/**
 * Return an element's name with surrounding whitespace removed.
 *
 * @param {object} el - element record
 * @returns {string} trimmed name, or "" when the name is null/undefined
 */
function nameOf(el) {
  const rawName = el.name ?? "";
  return rawName.trim();
}
3590
/**
 * Return an element's trimmed name in lower case.
 *
 * @param {object} el - element record
 * @returns {string} normalized name, or "" when the name is null/undefined
 */
function nameLowerOf(el) {
  return (el.name ?? "").trim().toLowerCase();
}
3593
/** Lower-case roles that are treated as click targets. */
const CLICKABLE_ROLE_NAMES = /* @__PURE__ */ new Set([
  "button",
  "link",
  "a",
  "menuitem",
  "tab",
  "checkbox",
  "radio",
  "switch",
  "option",
  "treeitem",
]);

/**
 * Tell whether a role names something a user would click.
 *
 * Case-insensitive. A missing role is treated as "not clickable"
 * rather than throwing, matching how the other matcher helpers treat
 * an absent role.
 *
 * @param {string|null|undefined} role - element role
 * @returns {boolean}
 */
function isClickableRole(role) {
  return CLICKABLE_ROLE_NAMES.has(String(role ?? "").toLowerCase());
}
3597
/** Lower-case roles that are treated as form inputs. */
const INPUT_ROLE_NAMES = /* @__PURE__ */ new Set([
  "textbox",
  "input",
  "textarea",
  "searchbox",
  "spinbutton",
  "combobox",
  "select",
  "checkbox",
  "radio",
]);

/**
 * Tell whether a role names a form input.
 *
 * Case-insensitive. A missing role is treated as "not an input"
 * rather than throwing, matching how the other matcher helpers treat
 * an absent role.
 *
 * @param {string|null|undefined} role - element role
 * @returns {boolean}
 */
function isInputRole(role) {
  return INPUT_ROLE_NAMES.has(String(role ?? "").toLowerCase());
}
3601
/**
 * Tell whether an element role can plausibly receive a verb.
 *
 * - no verb, or "click": clickable or input roles;
 * - "fill" / "type" / "select": input roles only;
 * - any other verb: every role is accepted.
 *
 * @param {string} role - element role
 * @param {string} [verb] - parsed verb
 * @returns {boolean}
 */
function verbCompatible(role, verb) {
  switch (verb) {
    case "fill":
    case "type":
    case "select":
      return isInputRole(role);
    case "click":
      return isClickableRole(role) || isInputRole(role);
    default:
      if (verb) return true;
      return isClickableRole(role) || isInputRole(role);
  }
}
3606
/**
 * Case-insensitive "contains as a whole word" test.
 *
 * The needle is matched literally. A word boundary is enforced only
 * on the sides where the needle itself begins or ends with a word
 * character (any Unicode letter, digit or underscore). This keeps
 * needles such as "c++", "(usd)" or "café" matchable, which plain
 * `\b` anchors cannot do because `\b` only knows ASCII word
 * characters and needs a word character on one side.
 *
 * @param {string} haystack - text to search in
 * @param {string} needle - literal text to look for
 * @returns {boolean} false when either argument is empty
 */
function wholeWordContains(haystack, needle) {
  if (!haystack || !needle) return false;
  const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const wordChar = "[\\p{L}\\p{N}_]";
  const lead = /^[\p{L}\p{N}_]/u.test(needle) ? `(?<!${wordChar})` : "";
  const trail = /[\p{L}\p{N}_]$/u.test(needle) ? `(?!${wordChar})` : "";
  return new RegExp(`${lead}${escaped}${trail}`, "iu").test(haystack);
}
3610
/**
 * Layer 0 — exact accessible-name match.
 *
 * Compares each verb-compatible element's normalized name against the
 * quoted name from the intent (or, failing that, the normalized
 * target). Every hit scores 1; the layer's floor is 0.95.
 */
const L0 = {
  name: "L0",
  floor: .95,
  run: (snapshot, parsed) => {
    const target = parsed.quotedName ?? parsed.normTarget;
    if (!target) return [];
    // Non-empty by construction, so an empty element name can never equal it.
    const wanted = target.toLowerCase();
    return snapshot.elements
      .filter((el) => verbCompatible(el.role, parsed.verb) && nameLowerOf(el) === wanted)
      .map((el) => ({
        el,
        score: 1,
        layer: "L0",
        reason: `L0 exact name "${el.name}"`
      }));
  }
};
3631
/**
 * Layer 1 — form-field label match.
 *
 * Runs only when the verb is absent or is fill/type/select. An input
 * element matches when its normalized name equals the field hint (or
 * normalized target), optionally decorated as a required field:
 * "<hint> *", "<hint> (required)", or the hint preceded by a prefix
 * made up solely of whitespace, `* ( ) : _ -` and the word
 * "required" (e.g. "* email", "(required) email", "required: email").
 *
 * The prefix must consist entirely of such decoration. Names like
 * "delivery email" are left to the fuzzier layers.
 *
 * Every hit scores 0.95; the layer's floor is 0.9.
 */
const L1 = {
  name: "L1",
  floor: .9,
  run: (snapshot, parsed) => {
    if (parsed.verb && parsed.verb !== "fill" && parsed.verb !== "type" && parsed.verb !== "select") return [];
    const target = parsed.fieldHint ?? parsed.normTarget;
    if (!target) return [];
    const tgt = target.toLowerCase();
    // Anchored at both ends so the whole prefix must be decoration.
    const decorationOnly = /^(?:[\s*():_-]|required)+$/;
    const out = [];
    for (const el of snapshot.elements) {
      if (!isInputRole(el.role)) continue;
      const nm = nameLowerOf(el);
      const isLabel = nm === tgt
        || nm === `${tgt} *`
        || nm === `${tgt} (required)`
        || (nm.endsWith(tgt) && decorationOnly.test(nm.slice(0, nm.length - tgt.length)));
      if (isLabel) out.push({
        el,
        score: .95,
        layer: "L1",
        reason: `L1 label "${el.name}"`
      });
    }
    return out;
  }
};
3653
/**
 * Layer 2 — placeholder match on input elements.
 *
 * Compares the lower-cased placeholder against the field hint (or
 * normalized target): an exact match scores 0.85, a whole-word
 * containment scores 0.75. The layer's floor is 0.7.
 */
const L2 = {
  name: "L2",
  floor: .7,
  run: (snapshot, parsed) => {
    const target = parsed.fieldHint ?? parsed.normTarget;
    if (!target) return [];
    const wanted = target.toLowerCase();
    const hits = [];
    snapshot.elements.forEach((el) => {
      if (!isInputRole(el.role)) return;
      const placeholder = (el.placeholder ?? "").toLowerCase();
      if (!placeholder) return;
      const exact = placeholder === wanted;
      if (!exact && !wholeWordContains(placeholder, wanted)) return;
      hits.push({
        el,
        score: exact ? .85 : .75,
        layer: "L2",
        reason: exact ? `L2 placeholder exact "${el.placeholder}"` : `L2 placeholder contains "${wanted}"`
      });
    });
    return hits;
  }
};
3681
/**
 * Layer 3 — fuzzy accessible-name match.
 *
 * A verb-compatible element matches when its trimmed name contains
 * the normalized target as a whole word. The score is 0.72 when the
 * target covers at least 80% of the name's length, else 0.68. The
 * layer's floor is 0.65.
 */
const L3 = {
  name: "L3",
  floor: .65,
  run: (snapshot, parsed) => {
    const { normTarget: target, verb } = parsed;
    if (!target) return [];
    const hits = [];
    for (const el of snapshot.elements) {
      if (!verbCompatible(el.role, verb)) continue;
      const label = nameOf(el);
      if (!label || !wholeWordContains(label, target)) continue;
      const coverage = target.length / label.length;
      hits.push({
        el,
        score: coverage >= .8 ? .72 : .68,
        layer: "L3",
        reason: `L3 fuzzy name "${label}"`
      });
    }
    return hits;
  }
};
3704
/**
 * Layer 4 — match on the `value` text of clickable elements.
 *
 * Compares the trimmed, lower-cased `value` against the normalized
 * target: an exact match scores 0.65, a whole-word containment scores
 * 0.6. The layer's floor is 0.6.
 */
const L4 = {
  name: "L4",
  floor: .6,
  run: (snapshot, parsed) => {
    const target = parsed.normTarget;
    if (!target) return [];
    const wanted = target.toLowerCase();
    const hits = [];
    for (const el of snapshot.elements) {
      if (!isClickableRole(el.role)) continue;
      const text = (el.value ?? "").toLowerCase().trim();
      if (!text) continue;
      if (text === wanted) {
        hits.push({ el, score: .65, layer: "L4", reason: `L4 text exact "${el.value}"` });
      } else if (wholeWordContains(text, wanted)) {
        hits.push({ el, score: .6, layer: "L4", reason: `L4 text contains "${wanted}"` });
      }
    }
    return hits;
  }
};
3732
/**
 * Layer 5 — identifier-style attribute match.
 *
 * Applies only when the normalized target looks like an identifier
 * (a letter followed by at least two letters, digits, `_` or `-`).
 * Separators are stripped from both sides before comparing. For each
 * element the first matching attribute wins, checked in this order:
 * testid (0.9), id (0.88), name_attr (0.86), aria_label (0.86).
 * The layer's floor is 0.85.
 */
const L5 = {
  name: "L5",
  floor: .85,
  run: (snapshot, parsed) => {
    const target = parsed.normTarget;
    if (!target || !/^[a-z][a-z0-9_-]{2,}$/i.test(target)) return [];
    const norm = target.toLowerCase().replace(/[-_]/g, "");
    const probes = [
      { key: "testid", label: "testid", score: .9 },
      { key: "id", label: "id", score: .88 },
      { key: "name_attr", label: "name", score: .86 },
      { key: "aria_label", label: "aria-label", score: .86 }
    ];
    const hits = [];
    for (const el of snapshot.elements) {
      const attrs = el.attrs;
      if (!attrs) continue;
      const probe = probes.find(({ key }) => attrs[key] && stripSep(attrs[key]).toLowerCase() === norm);
      if (!probe) continue;
      hits.push({
        el,
        score: probe.score,
        layer: "L5",
        reason: `L5 ${probe.label}="${attrs[probe.key]}"`
      });
    }
    return hits;
  }
};
3781
/**
 * Remove every hyphen, underscore and whitespace character.
 *
 * @param {string} s - identifier-like text
 * @returns {string} the text without separators
 */
function stripSep(s) {
  return s.split(/[-_\s]/).join("");
}
3784
/**
 * Matcher layers in the order they are tried. L0–L5 are defined
 * above; L6 (ordinal pick) and L7 (common-field heuristics) are
 * defined inline.
 */
const LAYERS = [
  L0,
  L1,
  L2,
  L3,
  L4,
  L5,
  {
    /**
     * Layer 6 — ordinal pick ("second button", "last link").
     *
     * Filters elements by the ordinal's kind (role, plural role or
     * tag), orders them in reading order — rows of 24 units of
     * bbox[1], then bbox[0] — and returns the n-th (n = -1 means
     * last) with score 0.8. Elements without a bbox or role are
     * tolerated: bbox-less elements sort after positioned ones and
     * keep their snapshot order among themselves.
     */
    name: "L6",
    floor: .75,
    run: (snapshot, parsed) => {
      if (!parsed.ordinal) return [];
      const { n, kind } = parsed.ordinal;
      const candidates = snapshot.elements.filter((el) => {
        if (!kind) return true;
        const role = (el.role || "").toLowerCase();
        return role === kind || role === `${kind}s` || (el.tag ?? "").toLowerCase() === kind;
      });
      if (candidates.length < Math.abs(n)) return [];
      const rowOf = (el) => el.bbox ? Math.floor(el.bbox[1] / 24) : Infinity;
      const colOf = (el) => el.bbox ? el.bbox[0] : Infinity;
      const sorted = [...candidates].sort((a, b) => {
        const ay = rowOf(a);
        const by = rowOf(b);
        if (ay !== by) return ay - by;
        const ax = colOf(a);
        const bx = colOf(b);
        // Explicit equality check avoids Infinity - Infinity (NaN).
        return ax === bx ? 0 : ax - bx;
      });
      const idx = n === -1 ? sorted.length - 1 : n - 1;
      if (idx < 0 || idx >= sorted.length) return [];
      return [{
        el: sorted[idx],
        score: .8,
        layer: "L6",
        reason: `L6 ordinal pick #${n} of ${sorted.length} ${kind ?? "elements"}`
      }];
    }
  },
  {
    /**
     * Layer 7 — heuristics for common fields, keyed on the field hint
     * (or normalized target): email, password, search, phone/tel,
     * submit/sign-in/log-in, username/user. Every hit scores 0.55;
     * any other hint yields nothing.
     */
    name: "L7",
    floor: .5,
    run: (snapshot, parsed) => {
      const hint = parsed.fieldHint ?? parsed.normTarget;
      if (!hint) return [];
      const h = hint.toLowerCase();
      const isInput = (el) => isInputRole(el.role);
      const nameHas = (el, word) => wholeWordContains(el.name ?? "", word);
      let label;
      let matches;
      if (h === "email") {
        label = "email";
        matches = (el) => el.inputType === "email" || isInput(el) && (wholeWordContains(el.placeholder ?? "", "email") || nameHas(el, "email"));
      } else if (h === "password") {
        label = "password";
        matches = (el) => el.inputType === "password" || isInput(el) && nameHas(el, "password");
      } else if (h === "search") {
        label = "search";
        matches = (el) => el.role === "searchbox" || el.inputType === "search" || isInput(el) && nameHas(el, "search");
      } else if (h === "phone" || h === "tel") {
        label = "phone";
        matches = (el) => el.inputType === "tel" || isInput(el) && nameHas(el, "phone");
      } else if (h === "submit" || h === "sign in" || h === "signin" || h === "log in" || h === "login") {
        const sumRe = /^(submit|send|continue|next|save|sign[\s-]?in|sign[\s-]?up|log[\s-]?in)$/i;
        label = "submit";
        matches = (el) => el.role === "button" && sumRe.test(el.name ?? "");
      } else if (h === "username" || h === "user") {
        label = "username";
        matches = (el) => isInput(el) && (nameHas(el, "user") || nameHas(el, "login") || nameHas(el, "account"));
      } else {
        return [];
      }
      return snapshot.elements.filter((el) => matches(el)).map((el) => ({
        el,
        score: .55,
        layer: "L7",
        reason: `L7 ${label} heuristic`
      }));
    }
  }
];
3876
+
3877
+ //#endregion
3878
+ //#region src/lib/browser-mcp/parse-intent.ts
3879
/** Leading action verb (followed by whitespace) at the start of an intent. */
const VERB_RE = /^\s*(click|press|tap|fill|enter|type|select|choose|scroll(?:[ -]?into[ -]?view)?|toggle|check|uncheck|open|focus|hover)\s+/i;

/** Trailing "with X" / "to X" / "= X" clause; group 1 is the value. */
const VALUE_RE = /\s+(?:with|to|=)\s+(.+?)\s*$/i;

/** First run of text enclosed in double quotes, single quotes or backticks. */
const QUOTED_RE = /["'`]([^"'`]+)["'`]/;

/** A capitalized word followed by up to three more capitalized or numeric words. */
const TITLE_CASE_RE = /\b([A-Z][\w]*(?:\s+[A-Z\d][\w]*){0,3})\b/;

/** Ordinal words mapped to 1-based positions; -1 stands for "last". */
const ORDINAL_WORDS = {
  first: 1,
  second: 2,
  third: 3,
  fourth: 4,
  fifth: 5,
  sixth: 6,
  seventh: 7,
  eighth: 8,
  ninth: 9,
  tenth: 10,
  last: -1,
};

/** An ordinal word followed by the word it qualifies (groups 1 and 2). */
const ORDINAL_WORD_RE = new RegExp(`\\b(${Object.keys(ORDINAL_WORDS).join("|")})\\s+(\\w+)`, "i");

/** A number with an optional st/nd/rd/th suffix, followed by a word. */
const ORDINAL_NUM_RE = /\b(\d+)(?:st|nd|rd|th)?\s+(\w+)/i;

/** Nouns that name a kind of control rather than a specific control. */
const FIELD_HINT_KINDS = [
  "field",
  "input",
  "textbox",
  "box",
  "search",
  "dropdown",
  "select",
  "menu",
  "button",
  "link",
  "tab",
  "checkbox",
  "radio",
  "switch",
];

/** The word immediately before a control-kind noun (group 1). */
const FIELD_HINT_RE = new RegExp(`\\b(\\w+)\\s+(?:${FIELD_HINT_KINDS.join("|")})\\b`, "i");

/** Articles and demonstratives removed when normalizing a target. */
const ARTICLES_RE = /\b(the|a|an|this|that)\b/gi;
3916
/**
 * Parse a natural-language intent into structured parts.
 *
 * Steps, in order: strip a leading verb; split off a trailing
 * "with/to/= value" clause; capture a quoted (or, failing that,
 * title-cased) name; detect an ordinal ("second button", "3rd link");
 * capture the word before a control-kind noun as the field hint; and
 * normalize the remaining target (lower case, articles removed, a
 * trailing control-kind noun dropped, a leading ordinal dropped).
 *
 * "first name" and "last name" are treated as field labels, not as
 * ordinals, so the two common name fields stay distinguishable.
 *
 * Always returns `rawTarget` and `normTarget`; `verb`, `quotedName`,
 * `fieldHint`, `ordinal` and `valueFromIntent` are present only when
 * detected. Unparseable input simply yields fewer fields.
 *
 * @param {unknown} intent - free-form instruction; null/undefined → ""
 * @returns {object} parsed intent
 */
function parseIntent(intent) {
  let work = String(intent ?? "").trim();
  let verb;
  const verbMatch = VERB_RE.exec(work);
  if (verbMatch) {
    verb = mapVerb(verbMatch[1]);
    work = work.slice(verbMatch[0].length);
  }
  let valueFromIntent;
  const valueMatch = VALUE_RE.exec(work);
  if (valueMatch) {
    valueFromIntent = valueMatch[1].trim();
    work = work.slice(0, valueMatch.index).trim();
  }
  let quotedName;
  const quotedMatch = QUOTED_RE.exec(work);
  if (quotedMatch) quotedName = quotedMatch[1].trim();
  else {
    const titleMatch = TITLE_CASE_RE.exec(work);
    if (titleMatch) quotedName = titleMatch[1].trim();
  }
  let ordinal;
  const ordWordMatch = ORDINAL_WORD_RE.exec(work);
  if (ordWordMatch) {
    const word = ordWordMatch[1].toLowerCase();
    const kind = ordWordMatch[2].toLowerCase();
    const n = ORDINAL_WORDS[word];
    // "first name" / "last name" are labels of form fields, not
    // positional picks among elements of kind "name".
    const isNameLabel = kind === "name" && (word === "first" || word === "last");
    if (typeof n === "number" && !isNameLabel) ordinal = {
      n,
      kind
    };
  } else {
    const ordNumMatch = ORDINAL_NUM_RE.exec(work);
    if (ordNumMatch) ordinal = {
      n: Number.parseInt(ordNumMatch[1], 10),
      kind: ordNumMatch[2].toLowerCase()
    };
  }
  let fieldHint;
  const fieldMatch = FIELD_HINT_RE.exec(work);
  if (fieldMatch) fieldHint = fieldMatch[1].toLowerCase();
  const rawTarget = work.trim();
  let normTarget = rawTarget.toLowerCase().replace(ARTICLES_RE, "").replace(/\s+/g, " ").trim();
  // Drop one trailing control-kind noun ("email field" → "email").
  for (const kind of FIELD_HINT_KINDS) {
    const tail = new RegExp(`\\s+${kind}$`, "i");
    if (tail.test(normTarget)) {
      normTarget = normTarget.replace(tail, "").trim();
      break;
    }
  }
  if (ordinal) normTarget = normTarget.replace(/^(\d+(?:st|nd|rd|th)?|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|last)\s+/i, "").trim();
  const out = {
    rawTarget,
    normTarget
  };
  if (verb) out.verb = verb;
  if (quotedName) out.quotedName = quotedName;
  if (fieldHint) out.fieldHint = fieldHint;
  if (ordinal) out.ordinal = ordinal;
  if (valueFromIntent !== void 0) out.valueFromIntent = valueFromIntent;
  return out;
}
3986
/**
 * Map a raw verb from an intent onto the matcher's canonical verbs.
 *
 * Spaces and hyphens are ignored, so every spelling of
 * "scroll into view" that the verb pattern accepts (including mixed
 * ones such as "scroll into-view") maps to "scroll_into_view".
 *
 * @param {string} raw - verb text as written by the user
 * @returns {"click"|"fill"|"type"|"select"|"scroll_into_view"|undefined}
 *   canonical verb; undefined for "hover", "focus" and unknown verbs
 */
function mapVerb(raw) {
  const v = raw.toLowerCase().replace(/[ -]/g, "");
  switch (v) {
    case "click":
    case "press":
    case "tap":
    case "toggle":
    case "check":
    case "uncheck":
    case "open":
      return "click";
    case "fill":
    case "enter":
      return "fill";
    case "type":
      return "type";
    case "select":
    case "choose":
      return "select";
    case "scroll":
    case "scrollintoview":
      return "scroll_into_view";
    default:
      // "hover", "focus" and anything unrecognized have no canonical verb.
      return void 0;
  }
}
3995
+
3358
3996
  //#endregion
3359
3997
  //#region src/lib/mcp-inflight.ts
3360
3998
  /**
@@ -3659,6 +4297,21 @@ async function callCompressor(systemPrompt, userMessage, tool, signal) {
3659
4297
  }
3660
4298
  }
3661
4299
/**
 * Public entry point to `callCompressor` for sibling modules.
 *
 * Forwards its four arguments unchanged and resolves with whatever
 * `callCompressor` resolves with. It exists as a separate wrapper,
 * rather than exposing `callCompressor` itself, so the inner function
 * can change its signature without breaking callers of this one.
 *
 * @param {string} systemPrompt - forwarded as-is
 * @param {string} userMessage - forwarded as-is
 * @param {object} tool - forwarded as-is
 * @param {AbortSignal} [signal] - forwarded as-is
 * @returns {Promise<unknown>} the result of `callCompressor`
 */
async function callCompressorPublic(systemPrompt, userMessage, tool, signal) {
  const result = await callCompressor(systemPrompt, userMessage, tool, signal);
  return result;
}
4314
+ /**
3662
4315
  * Strip a single leading / trailing ``` (or ```json) code fence from a
3663
4316
  * model's free-form text reply so JSON.parse works. Idempotent on
3664
4317
  * fence-free input. Defensive against the failure mode caught in PR #55
@@ -3680,12 +4333,32 @@ function stripCodeFence(text) {
3680
4333
  * whether the intent supplied a value. Single source of truth for
3681
4334
  * element matching.
3682
4335
  *
4336
+ * Phase 2 short-circuits the common case through the deterministic
4337
+ * matcher cascade in `./matcher.ts` — pure-sync, no LLM round-trip,
4338
+ * <5ms on a 200-element snapshot. Only when the cascade returns
4339
+ * `source: "escalate"` (0 candidates or >1 ambiguous candidates) do
4340
+ * we fall through to the existing fast-model `pickMatchingElements`
4341
+ * path. When we DO escalate, we pass the cascade's pre-filtered
4342
+ * top-K shortlist along so the fast model sees ~8 candidates instead
4343
+ * of the full 200-element snapshot — 3-5× token-cost reduction even
4344
+ * on misses.
4345
+ *
3683
4346
  * Returns ref="" + confidence=0 when no element matches — caller
3684
4347
  * should escalate to visual fallback (when `visualSurfaces` is
3685
4348
  * present) or surface the miss to the lead model.
3686
4349
  */
3687
4350
  async function pickElement(snapshot, intent, signal, value) {
3688
- const matches = await pickMatchingElements(snapshot, intent, signal);
4351
+ const det = deterministicResolve(snapshot, parseIntent(intent), value);
4352
+ if (det.source !== "escalate" && det.ref !== "") {
4353
+ const out$1 = {
4354
+ ref: det.ref,
4355
+ action: det.action,
4356
+ confidence: det.confidence
4357
+ };
4358
+ if (det.value !== void 0) out$1.value = det.value;
4359
+ return out$1;
4360
+ }
4361
+ const matches = await pickMatchingElements(snapshot, intent, signal, det.candidates);
3689
4362
  if (matches.length === 0) return {
3690
4363
  ref: "",
3691
4364
  action: "click",
@@ -3756,9 +4429,28 @@ const FIND_ELEMENTS_TOOL = {
3756
4429
  * Return up to 5 candidate matches for an intent. Used by
3757
4430
  * `browser_find` — the lead model gets a small ranked list rather than
3758
4431
  * a full element dump. Empty array when nothing matches.
3759
- */
3760
- async function pickMatchingElements(snapshot, intent, signal) {
3761
- const trimmed = snapshot.elements.map((e) => ({
4432
+ *
4433
+ * Phase 2 short-circuits via the deterministic matcher cascade when
4434
+ * possible. When the cascade finds a single confident match, we
4435
+ * synthesize a one-item `FindMatch[]` and skip the fast-model
4436
+ * round-trip. When the cascade's `candidates` shortlist is passed in
4437
+ * by `pickElement` (escalation path), we trim the snapshot to just
4438
+ * those refs before sending to the fast model — keeps tokens down on
4439
+ * misses too.
4440
+ */
4441
+ async function pickMatchingElements(snapshot, intent, signal, shortlist) {
4442
+ if (!shortlist) {
4443
+ const det = deterministicResolve(snapshot, parseIntent(intent));
4444
+ if (det.source !== "escalate" && det.ref !== "") {
4445
+ if (snapshot.elements.find((e) => e.ref === det.ref)) return [{
4446
+ ref: det.ref,
4447
+ reason: `deterministic ${det.source}: ${det.reason}`
4448
+ }];
4449
+ }
4450
+ shortlist = det.candidates;
4451
+ }
4452
+ const refSet = shortlist && shortlist.length > 0 ? new Set(shortlist.map((s) => s.ref)) : void 0;
4453
+ const trimmed = (refSet ? snapshot.elements.filter((e) => refSet.has(e.ref)) : snapshot.elements).map((e) => ({
3762
4454
  ref: e.ref,
3763
4455
  role: e.role,
3764
4456
  name: e.name
@@ -3934,6 +4626,235 @@ async function pickElementVisual(screenshotB64, contentType, intent, visualSurfa
3934
4626
  };
3935
4627
  }
3936
4628
 
4629
+ //#endregion
4630
+ //#region src/lib/browser-mcp/decompose.ts
4631
/**
 * Template: `log in [to <site>] with <user> / <password>` (also `login` and
 * `log-in`). Group 1 captures the user, group 2 the password.
 */
const LOGIN_RE = /^log[ -]?in (?:to .+? )?with\s+([^\s/]+)\s*\/\s*(.+?)\s*$/i;
/**
 * Template: `search [for] <query> and click [the] first result`.
 * Group 1 captures the query.
 */
const SEARCH_CLICK_RE = /^search\s+(?:for\s+)?(.+?)\s+and\s+click\s+(?:the\s+)?first\s+result\s*$/i;
/**
 * Separators between chained steps: ` and then `, ` then `, `;` and `, and `.
 *
 * Every alternative already consumes its own surrounding whitespace, so no
 * extra outer `\s*` is needed; stacking one on top of the inner `\s+` / `\s*`
 * made the pattern backtrack polynomially on long whitespace runs. The
 * `(?<!\s)` lookbehind only lets a match begin at the first character of a
 * whitespace run (or directly on the separator), which is where the leftmost
 * match starts anyway, and keeps scanning linear in the input length.
 */
const CONJUNCTION_SPLIT_RE = /(?<!\s)(?:\s+and\s+then\s+|\s+then\s+|\s*;\s*|\s*,\s+and\s+)/i;
/**
 * Decompose a natural-language intent into atomic steps.
 *
 * Resolution order:
 *   1. blank intent       -> one empty-intent step, template "fallback"
 *   2. login template     -> username fill, password fill, submit click
 *   3. search template    -> query fill, submit click, first-result click
 *   4. conjunction split  -> one step per separated part (two or more parts)
 *   5. anything else      -> the trimmed intent as one step, "fallback"
 *
 * @param {unknown} intent Raw intent text; `null` / `undefined` count as blank.
 * @param {string} [value] Optional value. It is attached to the single
 *   fallback step, or to the FIRST step of a conjunction split; the login and
 *   search templates take their values from the intent text instead.
 * @returns {{steps: Array<{intent: string, value?: string}>, template: string, successSummary?: string}}
 */
function decompose(intent, value) {
	const raw = String(intent ?? "").trim();
	// Only emit a `value` key when a value was actually supplied.
	const valuePart = value !== void 0 ? { value } : {};
	if (!raw) return {
		steps: [{
			intent: "",
			...valuePart
		}],
		template: "fallback"
	};
	const loginMatch = LOGIN_RE.exec(raw);
	if (loginMatch) {
		const user = loginMatch[1].trim();
		const pass = loginMatch[2].trim();
		return {
			steps: [
				{
					intent: "the email or username input",
					value: user
				},
				{
					intent: "the password input",
					value: pass
				},
				{ intent: "the Sign in or Log in button" }
			],
			template: "login",
			successSummary: "logged in"
		};
	}
	const searchMatch = SEARCH_CLICK_RE.exec(raw);
	if (searchMatch) {
		const query = searchMatch[1].trim();
		return {
			steps: [
				{
					intent: "the search input",
					value: query
				},
				{ intent: "the search button or submit" },
				{ intent: "the first search result" }
			],
			template: "search_click",
			successSummary: `searched for "${query}" and opened first result`
		};
	}
	if (CONJUNCTION_SPLIT_RE.test(raw)) {
		const parts = raw.split(CONJUNCTION_SPLIT_RE).map((p) => p.trim()).filter(Boolean);
		// A lone separator ("foo;") leaves a single part: treat it as a
		// plain single-step intent below rather than a compound.
		if (parts.length >= 2) return {
			steps: parts.map((p, i) => {
				if (i === 0 && value !== void 0) return {
					intent: p,
					value
				};
				return { intent: p };
			}),
			template: "conjunction"
		};
	}
	return {
		steps: [{
			intent: raw,
			...valuePart
		}],
		template: "fallback"
	};
}
4707
+
4708
+ //#endregion
4709
+ //#region src/lib/browser-mcp/observe.ts
4710
/** System prompt for the page-description call made by `observePage`. */
const OBSERVE_SYSTEM = `You describe a web page for an AI assistant that cannot see the DOM.

Write 2-4 sentences focused on user-actionable elements (forms, buttons, links) and the page's purpose. If 'intent' is provided, focus the description on the region most relevant to that intent.

DO NOT mention DOM refs, selectors, bbox coordinates, or any internal identifiers. Plain prose only. Treat the reader as someone who will issue commands like "click the Sign In button" — describe what's there in terms they can act on.

Call the describe_page tool with your description.`;
/** Tool schema the description call answers through: one required `description` string. */
const OBSERVE_TOOL = {
	name: "describe_page",
	description: "Report the natural-language description of the page.",
	parameters: {
		type: "object",
		required: ["description"],
		additionalProperties: false,
		properties: {
			description: {
				type: "string",
				description: "2-4 sentence prose description of the visible page state."
			}
		}
	}
};
/**
 * Turn a page snapshot into a short prose description.
 *
 * Sends the intent, url, title, the first 4000 characters of page text, and
 * the role + name of up to 80 named elements to `callCompressorPublic`, then
 * wraps the returned description. If the reply carries no string
 * `description`, a fixed placeholder sentence is used instead.
 *
 * @param snapshot Page snapshot; reads `elements`, `text`, `url`, `title`
 *   and `visualSurfaces`.
 * @param {string} [intent] Optional focus hint; sent as "" when absent.
 * @param signal Forwarded to `callCompressorPublic` unchanged.
 * @returns {Promise<{description: string, hasVisualSurfaces: boolean, url?: string, title?: string}>}
 */
async function observePage(snapshot, intent, signal) {
	// True when the snapshot lists at least one visual surface.
	const hasSurfaces = () => Boolean(snapshot.visualSurfaces && snapshot.visualSurfaces.length > 0);
	// Unnamed elements are dropped BEFORE the 80-element cap is applied.
	const named = snapshot.elements.filter(({ name }) => name && name.length > 0);
	const actionable = named.slice(0, 80).map(({ role, name }) => ({
		role,
		name
	}));
	const request = {
		intent: intent ?? "",
		url: snapshot.url ?? "",
		title: snapshot.title ?? "",
		visible_text: (snapshot.text ?? "").slice(0, 4e3),
		actionable_elements: actionable,
		has_visual_surfaces: hasSurfaces()
	};
	const reply = await callCompressorPublic(OBSERVE_SYSTEM, JSON.stringify(request), OBSERVE_TOOL, signal);
	let description = "Page contents could not be described.";
	if (reply && typeof reply === "object" && typeof reply.description === "string") description = reply.description;
	const result = {
		description,
		hasVisualSurfaces: hasSurfaces()
	};
	// url / title are only echoed back when the snapshot has them.
	if (snapshot.url) result.url = snapshot.url;
	if (snapshot.title) result.title = snapshot.title;
	return result;
}
4755
+
4756
+ //#endregion
4757
+ //#region src/lib/browser-mcp/planner.ts
4758
/** System prompt for the replanning call made by `planCompoundReplan`. */
const PLANNER_SYSTEM = `You are a browser-automation replanner. A user issued a high-level intent that was decomposed into atomic steps. Several steps ran successfully, then one failed. You see the page state AFTER the failure and decide what to do next.

Your job: produce a revised list of atomic steps that will accomplish the original intent given the current page. If you cannot — the page has changed in a way that makes the intent impossible (login form vanished, navigation moved elsewhere, captcha appeared) — return an empty list and explain why in reasoning.

Each replanned step is a free-form natural-language intent ("the email input", "the Sign In button at the bottom of the form") plus an optional value for fill/type/select actions. Be SPECIFIC about element location ("at the bottom of the form", "in the top navigation") so the deterministic matcher cascade can resolve it without ambiguity. Do NOT reference element refs.

Cost rule: you get ONE call per compound failure. Make every step count.

Call the replan_compound tool with your answer.`;
/** Tool schema for the replanner's answer: up to 8 `{intent, value?}` steps plus a reasoning string. */
const PLANNER_TOOL = {
	name: "replan_compound",
	description: "Report the revised atomic steps to complete the original compound intent.",
	parameters: {
		type: "object",
		required: ["steps", "reasoning"],
		additionalProperties: false,
		properties: {
			steps: {
				type: "array",
				maxItems: 8,
				items: {
					type: "object",
					required: ["intent"],
					additionalProperties: false,
					properties: {
						intent: { type: "string" },
						value: { type: "string" }
					}
				}
			},
			reasoning: {
				type: "string",
				description: "1-2 sentence explanation of the replanning decision."
			}
		}
	}
};
/**
 * Ask the planner for a revised step list after a compound step failed.
 *
 * The request carries the original intent/value, the completed steps, the
 * failed step, the failure reason, and a trimmed view of the current page:
 * url, title, the first 3000 characters of text, and for the first 80
 * elements their role plus any truthy name / placeholder / value.
 *
 * The reply is validated defensively: a non-object reply yields no steps
 * with a fixed reasoning message, a non-array `steps` yields no steps, and
 * within the first 8 entries only those with a non-empty string `intent`
 * are kept (a `value` is kept only when it is a string).
 *
 * @param input Reads `snapshot`, `originalIntent`, `originalValue`,
 *   `completedSteps`, `failedStep` and `failureReason`.
 * @param signal Forwarded to `callCompressorPublic` unchanged.
 * @returns {Promise<{steps: Array<{intent: string, value?: string}>, reasoning: string}>}
 */
async function planCompoundReplan(input, signal) {
	const OPTIONAL_KEYS = [
		"name",
		"placeholder",
		"value"
	];
	const actionable = input.snapshot.elements.slice(0, 80).map((el) => {
		const entry = { role: el.role };
		// Falsy (empty / missing) fields are left out entirely.
		for (const key of OPTIONAL_KEYS) if (el[key]) entry[key] = el[key];
		return entry;
	});
	// Serialise a step as {intent} or {intent, value}; `value` only when defined.
	const describeStep = (step) => step.value !== void 0 ? {
		intent: step.intent,
		value: step.value
	} : { intent: step.intent };
	const request = {
		original_intent: input.originalIntent,
		original_value: input.originalValue,
		completed_steps: input.completedSteps.map(describeStep),
		failed_step: describeStep(input.failedStep),
		failure_reason: input.failureReason,
		page_now: {
			url: input.snapshot.url ?? "",
			title: input.snapshot.title ?? "",
			visible_text: (input.snapshot.text ?? "").slice(0, 3e3),
			actionable_elements: actionable
		}
	};
	const reply = await callCompressorPublic(PLANNER_SYSTEM, JSON.stringify(request), PLANNER_TOOL, signal);
	if (!reply || typeof reply !== "object") return {
		steps: [],
		reasoning: "planner returned empty response"
	};
	const reasoning = typeof reply.reasoning === "string" ? reply.reasoning : "";
	if (!Array.isArray(reply.steps)) return {
		steps: [],
		reasoning
	};
	// The cap of 8 is applied before invalid entries are discarded.
	const steps = reply.steps.slice(0, 8).flatMap((candidate) => {
		if (!candidate || typeof candidate !== "object") return [];
		const { intent, value } = candidate;
		if (typeof intent !== "string" || intent.length === 0) return [];
		return [typeof value === "string" ? {
			intent,
			value
		} : { intent }];
	});
	return {
		steps,
		reasoning
	};
}
4857
+
3937
4858
  //#endregion
3938
4859
  //#region src/lib/browser-mcp/index.ts
3939
4860
  /**
@@ -3990,7 +4911,7 @@ const BROWSER_TOOLS = Object.freeze([
3990
4911
  additionalProperties: false,
3991
4912
  properties: {}
3992
4913
  },
3993
- capability: "browser",
4914
+ capability: "browser_power",
3994
4915
  async handler(args, signal) {
3995
4916
  return dispatchBrowserTool("browser_list_tabs", args, signal);
3996
4917
  }
@@ -4031,7 +4952,7 @@ const BROWSER_TOOLS = Object.freeze([
4031
4952
  description: "Array of tab ids to close (from browser_list_tabs)."
4032
4953
  } }
4033
4954
  },
4034
- capability: "browser",
4955
+ capability: "browser_power",
4035
4956
  async handler(args, signal) {
4036
4957
  return dispatchBrowserTool("browser_close_tab", args, signal);
4037
4958
  }
@@ -4116,7 +5037,7 @@ const BROWSER_TOOLS = Object.freeze([
4116
5037
  }
4117
5038
  }
4118
5039
  },
4119
- capability: "browser",
5040
+ capability: "browser_power",
4120
5041
  async handler(args, signal) {
4121
5042
  return dispatchBrowserTool("browser_read_page", args, signal);
4122
5043
  }
@@ -4175,7 +5096,7 @@ const BROWSER_TOOLS = Object.freeze([
4175
5096
  }
4176
5097
  }
4177
5098
  },
4178
- capability: "browser",
5099
+ capability: "browser_power",
4179
5100
  async handler(args, signal) {
4180
5101
  return dispatchBrowserTool("browser_scroll", args, signal);
4181
5102
  }
@@ -4195,7 +5116,7 @@ const BROWSER_TOOLS = Object.freeze([
4195
5116
  }
4196
5117
  }
4197
5118
  },
4198
- capability: "browser",
5119
+ capability: "browser_power",
4199
5120
  async handler(args, signal) {
4200
5121
  return dispatchBrowserTool("browser_keyboard", args, signal);
4201
5122
  }
@@ -4232,7 +5153,7 @@ const BROWSER_TOOLS = Object.freeze([
4232
5153
  }
4233
5154
  }
4234
5155
  },
4235
- capability: "browser",
5156
+ capability: "browser_power",
4236
5157
  async handler(args, signal) {
4237
5158
  return dispatchBrowserTool("browser_wait", args, signal);
4238
5159
  }
@@ -4256,7 +5177,7 @@ const BROWSER_TOOLS = Object.freeze([
4256
5177
  }
4257
5178
  }
4258
5179
  },
4259
- capability: "browser",
5180
+ capability: "browser_power",
4260
5181
  async handler(args, signal) {
4261
5182
  return dispatchBrowserTool("browser_eval_js", args, signal);
4262
5183
  }
@@ -4288,7 +5209,7 @@ const BROWSER_TOOLS = Object.freeze([
4288
5209
  }
4289
5210
  }
4290
5211
  },
4291
- capability: "browser",
5212
+ capability: "browser_power",
4292
5213
  async handler(args, signal) {
4293
5214
  return dispatchBrowserTool("browser_download", args, signal);
4294
5215
  }
@@ -4352,7 +5273,7 @@ const BROWSER_TOOLS = Object.freeze([
4352
5273
  }
4353
5274
  }
4354
5275
  },
4355
- capability: "browser",
5276
+ capability: "browser_power",
4356
5277
  async handler(args, signal) {
4357
5278
  return dispatchBrowserTool("browser_mouse", args, signal);
4358
5279
  }
@@ -4426,7 +5347,7 @@ const BROWSER_TOOLS = Object.freeze([
4426
5347
  }
4427
5348
  }
4428
5349
  },
4429
- capability: "browser",
5350
+ capability: "browser_power",
4430
5351
  async handler(args, signal) {
4431
5352
  return dispatchBrowserTool("browser_drag", args, signal);
4432
5353
  }
@@ -4450,7 +5371,7 @@ const BROWSER_TOOLS = Object.freeze([
4450
5371
  }
4451
5372
  }
4452
5373
  },
4453
- capability: "browser",
5374
+ capability: "browser_power",
4454
5375
  async handler(args, signal) {
4455
5376
  return dispatchBrowserTool("browser_type", args, signal);
4456
5377
  }
@@ -4491,7 +5412,7 @@ const BROWSER_TOOLS = Object.freeze([
4491
5412
  }
4492
5413
  }
4493
5414
  },
4494
- capability: "browser",
5415
+ capability: "browser_power",
4495
5416
  async handler(args, signal) {
4496
5417
  const kind = args.kind === "network" ? "network" : "console";
4497
5418
  const tool = kind === "network" ? "browser_network_log" : "browser_console_logs";
@@ -4548,7 +5469,7 @@ const BROWSER_TOOLS = Object.freeze([
4548
5469
  }
4549
5470
  }
4550
5471
  },
4551
- capability: "browser_compound",
5472
+ capability: "browser_power",
4552
5473
  async handler(args, signal) {
4553
5474
  const tabId = typeof args.tabId === "number" ? args.tabId : void 0;
4554
5475
  const intent = typeof args.intent === "string" ? args.intent : "";
@@ -4615,65 +5536,109 @@ const BROWSER_TOOLS = Object.freeze([
4615
5536
  const value = typeof args.value === "string" ? args.value : void 0;
4616
5537
  if (!refIn && !intent) return toolEnvelope({ error: "either `ref` (REF mode) or `intent` (INTENT mode) is required" }, true);
4617
5538
  if (refIn) return dispatchActionByRef(tabId, refIn, typeof args.action === "string" ? args.action : "click", value, signal);
4618
- const snapshot = await fetchSnapshot(tabId, signal);
4619
- const picked = await pickElement(snapshot, intent, signal, value);
4620
- if (!picked.ref || picked.confidence < .5) {
4621
- const surfaces = snapshot.visualSurfaces;
4622
- if (surfaces && surfaces.length > 0) {
4623
- const shotEnv = await dispatchBrowserTool("browser_screenshot", {
4624
- tabId,
4625
- format: "png"
5539
+ const decomposed = decompose(intent, value);
5540
+ if (decomposed.steps.length === 1) return runAtomicIntentStep(tabId, decomposed.steps[0].intent, decomposed.steps[0].value, signal);
5541
+ const summaries = [];
5542
+ let navigated = false;
5543
+ const completedSteps = [];
5544
+ for (let i = 0; i < decomposed.steps.length; i++) {
5545
+ const step = decomposed.steps[i];
5546
+ const env = await runAtomicIntentStep(tabId, step.intent, step.value, signal);
5547
+ const stepText = env.content?.[0]?.text;
5548
+ let stepResult = {};
5549
+ if (typeof stepText === "string") try {
5550
+ stepResult = JSON.parse(stepText);
5551
+ } catch {}
5552
+ if (env.isError || stepResult.ok === false) try {
5553
+ const failureReason = String(stepResult.error ?? "unknown");
5554
+ const replan = await planCompoundReplan({
5555
+ originalIntent: intent,
5556
+ originalValue: value,
5557
+ completedSteps,
5558
+ failedStep: step,
5559
+ failureReason,
5560
+ snapshot: await fetchSnapshot(tabId, signal)
4626
5561
  }, signal);
4627
- if (shotEnv.isError) return toolEnvelope({
5562
+ if (replan.steps.length === 0) return toolEnvelope({
4628
5563
  ok: false,
4629
- error: "no text match; screenshot for visual fallback failed",
4630
- picked
5564
+ summary: `compound step ${i + 1}/${decomposed.steps.length} failed and planner declined: ${replan.reasoning || failureReason}`,
5565
+ template: decomposed.template,
5566
+ steps_completed: i,
5567
+ failed_step: step.intent,
5568
+ planner_reasoning: replan.reasoning
4631
5569
  }, true);
4632
- const shotText = shotEnv.content?.[0]?.text;
4633
- let shot = {};
4634
- try {
4635
- shot = shotText ? JSON.parse(shotText) : {};
4636
- } catch {
4637
- return toolEnvelope({
5570
+ const replanSummaries = [];
5571
+ for (let j = 0; j < replan.steps.length; j++) {
5572
+ const rstep = replan.steps[j];
5573
+ const renv = await runAtomicIntentStep(tabId, rstep.intent, rstep.value, signal);
5574
+ const rtext = renv.content?.[0]?.text;
5575
+ let rresult = {};
5576
+ if (typeof rtext === "string") try {
5577
+ rresult = JSON.parse(rtext);
5578
+ } catch {}
5579
+ if (renv.isError || rresult.ok === false) return toolEnvelope({
4638
5580
  ok: false,
4639
- error: "no text match; screenshot envelope unparseable"
5581
+ summary: `compound failed at original step ${i + 1}, planner replan also failed at step ${j + 1}/${replan.steps.length}: ${String(rresult.error ?? "unknown")}`,
5582
+ template: decomposed.template,
5583
+ steps_completed: i,
5584
+ failed_step: rstep.intent,
5585
+ planner_reasoning: replan.reasoning
4640
5586
  }, true);
5587
+ if (typeof rresult.action_taken === "string") replanSummaries.push(`${rresult.action_taken} (${rstep.intent})`);
5588
+ if (rresult.navigated === true) navigated = true;
4641
5589
  }
4642
- if (!shot.contentType || !shot.dataBase64) return toolEnvelope({
4643
- ok: false,
4644
- error: "no text match; screenshot envelope missing fields"
4645
- }, true);
4646
- const visual = await pickElementVisual(shot.dataBase64, shot.contentType, intent, surfaces, signal);
4647
- if (visual.confidence < .5) return toolEnvelope({
4648
- ok: false,
4649
- error: "no element matched intent (text + visual)",
4650
- picked,
4651
- visual
4652
- }, true);
4653
- const clickEnv = await dispatchBrowserTool("browser_mouse", {
4654
- tabId,
4655
- action: "click",
4656
- x: visual.x,
4657
- y: visual.y,
4658
- force: true
4659
- }, signal);
4660
- if (clickEnv.isError) return clickEnv;
4661
5590
  return toolEnvelope({
4662
5591
  ok: true,
4663
- action_taken: "click_visual",
4664
- x: visual.x,
4665
- y: visual.y,
4666
- confidence: visual.confidence,
4667
- reason: visual.reason
5592
+ summary: `compound recovered via planner (${replan.reasoning}): ${replanSummaries.join("")}`,
5593
+ template: decomposed.template,
5594
+ steps_completed: i + replan.steps.length,
5595
+ navigated,
5596
+ planner_used: true,
5597
+ planner_reasoning: replan.reasoning
4668
5598
  });
5599
+ } catch (replanErr) {
5600
+ return toolEnvelope({
5601
+ ok: false,
5602
+ summary: `compound step ${i + 1}/${decomposed.steps.length} failed; planner errored: ${replanErr instanceof Error ? replanErr.message : String(replanErr)}`,
5603
+ template: decomposed.template,
5604
+ steps_completed: i,
5605
+ failed_step: step.intent
5606
+ }, true);
5607
+ }
5608
+ if (typeof stepResult.action_taken === "string") summaries.push(`${stepResult.action_taken} (${step.intent})`);
5609
+ if (stepResult.navigated === true) navigated = true;
5610
+ completedSteps.push(step);
5611
+ }
5612
+ return toolEnvelope({
5613
+ ok: true,
5614
+ summary: decomposed.successSummary ?? summaries.join(" → "),
5615
+ template: decomposed.template,
5616
+ steps_completed: decomposed.steps.length,
5617
+ navigated
5618
+ });
5619
+ }
5620
+ },
5621
+ {
5622
+ toolNameHttp: "browser_observe",
5623
+ description: "Get a natural-language description of the current page's user-actionable state — what forms, buttons, links, and content sections are visible — in 2-4 sentences. Optional `intent` focuses the description on a region ('describe the login form', 'what's in the comments section'). Use this BEFORE browser_act when you don't know what's on the page, or AFTER navigation to confirm the page loaded. Cheaper than screenshots when text is enough. Does not include canvas/SVG content — those surface as a `hasVisualSurfaces` flag; switch to browser_screenshot for visuals.",
5624
+ inputSchema: {
5625
+ type: "object",
5626
+ required: ["tabId"],
5627
+ additionalProperties: false,
5628
+ properties: {
5629
+ tabId: { type: "number" },
5630
+ intent: {
5631
+ type: "string",
5632
+ description: "Optional natural-language focus ('describe the form', 'what's in the sidebar')."
4669
5633
  }
4670
- return toolEnvelope({
4671
- ok: false,
4672
- error: "no element matched intent",
4673
- picked
4674
- }, true);
4675
5634
  }
4676
- return dispatchActionByRef(tabId, picked.ref, picked.action, picked.value ?? value, signal);
5635
+ },
5636
+ capability: "browser_compound",
5637
+ async handler(args, signal) {
5638
+ const tabId = typeof args.tabId === "number" ? args.tabId : void 0;
5639
+ const intent = typeof args.intent === "string" ? args.intent : void 0;
5640
+ if (!tabId) return toolEnvelope({ error: "tabId required" }, true);
5641
+ return toolEnvelope(await observePage(await fetchSnapshot(tabId, signal), intent, signal));
4677
5642
  }
4678
5643
  },
4679
5644
  {
@@ -4716,6 +5681,76 @@ const BROWSER_TOOLS = Object.freeze([
4716
5681
  }
4717
5682
  ]);
4718
5683
/**
 * Execute one atomic intent against a tab and return a tool envelope.
 *
 * Flow:
 *   1. Fetch a snapshot and ask `pickElement` for a target.
 *   2. If a ref came back and its confidence is not below 0.5, dispatch the
 *      picked action on that ref via `dispatchActionByRef`.
 *   3. Otherwise, when the snapshot reports visual surfaces, take a PNG
 *      screenshot, ask `pickElementVisual` for coordinates, and issue a
 *      forced `browser_mouse` click there.
 *   4. Every other outcome is an `ok: false` error envelope.
 *
 * Shared by the single-step and compound paths of `browser_act`.
 *
 * @param tabId Target tab id, forwarded to the underlying tools.
 * @param {string} intent Natural-language description of the target element.
 * @param {string} [value] Fallback value when the picker returns none.
 * @param signal Forwarded to every downstream call.
 */
async function runAtomicIntentStep(tabId, intent, value, signal) {
	// Builds an error envelope: { ok: false, error, ...extra }.
	const failure = (error, extra = {}) => toolEnvelope({
		ok: false,
		error,
		...extra
	}, true);
	const snapshot = await fetchSnapshot(tabId, signal);
	const picked = await pickElement(snapshot, intent, signal, value);
	const needsFallback = !picked.ref || picked.confidence < .5;
	if (!needsFallback) return dispatchActionByRef(tabId, picked.ref, picked.action, picked.value ?? value, signal);
	const surfaces = snapshot.visualSurfaces;
	const hasSurfaces = surfaces && surfaces.length > 0;
	if (!hasSurfaces) return failure("no element matched intent", { picked });
	const shotEnv = await dispatchBrowserTool("browser_screenshot", {
		tabId,
		format: "png"
	}, signal);
	if (shotEnv.isError) return failure("no text match; screenshot for visual fallback failed", { picked });
	const shotText = shotEnv.content?.[0]?.text;
	let shot = {};
	try {
		// An empty/missing payload is tolerated here and rejected by the field check below.
		shot = shotText ? JSON.parse(shotText) : {};
	} catch {
		return failure("no text match; screenshot envelope unparseable");
	}
	if (!shot.contentType || !shot.dataBase64) return failure("no text match; screenshot envelope missing fields");
	const visual = await pickElementVisual(shot.dataBase64, shot.contentType, intent, surfaces, signal);
	if (visual.confidence < .5) return failure("no element matched intent (text + visual)", {
		picked,
		visual
	});
	const clickEnv = await dispatchBrowserTool("browser_mouse", {
		tabId,
		action: "click",
		x: visual.x,
		y: visual.y,
		force: true
	}, signal);
	// A failed click is passed through untouched so its own error reaches the caller.
	if (clickEnv.isError) return clickEnv;
	return toolEnvelope({
		ok: true,
		action_taken: "click_visual",
		x: visual.x,
		y: visual.y,
		confidence: visual.confidence,
		reason: visual.reason
	});
}
5753
+ /**
4719
5754
  * Dispatch an action against a known ref via the appropriate primitive.
4720
5755
  * Shared between REF mode and INTENT-mode-text-match in `browser_act`.
4721
5756
  * Returns an MCP envelope (text content + optional isError).
@@ -7309,6 +8344,27 @@ function workerToolsEnabled() {
7309
8344
  function browserCompoundToolsEnabled() {
7310
8345
  return compressorAvailable();
7311
8346
  }
8347
/**
 * Gate for tools tagged with the `browser_power` capability.
 *
 * True only when `state.powerBrowseEnabled` is exactly `true`. That flag is
 * set during server setup from the `--power-browse` option or
 * `GH_ROUTER_ENABLE_POWER_BROWSE=1`, and setup also switches basic browse on
 * whenever power browse is on.
 *
 * The MCP handler combines this with `browserToolsEnabled()` both when
 * listing tools and when dispatching a call, so a power tool is reachable
 * only when both gates pass.
 *
 * @returns {boolean}
 */
function browserPowerToolsEnabled() {
	const { powerBrowseEnabled } = state;
	// Strict comparison: truthy non-boolean values do not enable power mode.
	return powerBrowseEnabled === true;
}
7312
8368
 
7313
8369
  //#endregion
7314
8370
  //#region src/routes/mcp/handler.ts
@@ -7486,6 +8542,7 @@ function toolEntries() {
7486
8542
  if (t.capability === "stand_in") return standInToolEnabled();
7487
8543
  if (t.capability === "browser") return browserToolsEnabled();
7488
8544
  if (t.capability === "browser_compound") return browserToolsEnabled() && browserCompoundToolsEnabled();
8545
+ if (t.capability === "browser_power") return browserToolsEnabled() && browserPowerToolsEnabled();
7489
8546
  return true;
7490
8547
  }).map((t) => ({
7491
8548
  name: t.toolNameHttp,
@@ -7778,6 +8835,7 @@ async function handleToolsCall(body) {
7778
8835
  if (nonPersonaTool && nonPersonaTool.capability === "stand_in" && !standInToolEnabled()) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
7779
8836
  if (nonPersonaTool && nonPersonaTool.capability === "browser" && !browserToolsEnabled()) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
7780
8837
  if (nonPersonaTool && nonPersonaTool.capability === "browser_compound" && !(browserToolsEnabled() && browserCompoundToolsEnabled())) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
8838
+ if (nonPersonaTool && nonPersonaTool.capability === "browser_power" && !(browserToolsEnabled() && browserPowerToolsEnabled())) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
7781
8839
  let personaPrompt;
7782
8840
  let personaContext;
7783
8841
  let personaEffort;
@@ -11331,7 +12389,10 @@ function buildPeerAwarenessSnippet(opts) {
11331
12389
  if (opts.workerToolsAvailable) para2Parts.push("`worker_explore` runs a Gemini-backed read-only worker that returns a summary, using its own context rather than yours; concurrent launches share the `MAX_INFLIGHT_TOOLS_CALL=8` cap with operator traffic.", "`worker_implement` is the same worker with edit/write/bash; `worktree: true` runs it in an isolated git worktree and returns the diff.", "Workers themselves have `code_search` in their toolset.");
11332
12390
  para2Parts.push("`web_search` surfaces citable sources for docs, errors, and upstream issues.");
11333
12391
  if (opts.standInAvailable) para2Parts.push("`stand_in` provides three-lab consensus for decision tiebreak when the user is unavailable.");
11334
- if (opts.browseAvailable) para2Parts.push("`browser_*` tools (under `mcp__gh-router-peers__browser_*`) drive a real Chrome / Edge browser via a local extension; prefer the L2 compound tools `browser_act(intent | ref, value?)` / `browser_find(intent)` / `browser_extract(schema, instruction)` over the L0/L1 primitives.");
12392
+ if (opts.browseAvailable) {
12393
+ const powerNote = opts.powerBrowseAvailable ? " Power mode is on: the L0/L1 primitives (`browser_mouse`, `browser_drag`, `browser_type`, `browser_keyboard`, `browser_scroll`, `browser_eval_js`, `browser_read_page`, `browser_diagnostics`, `browser_find`) are also available for direct DOM / coordinate control." : "";
12394
+ para2Parts.push(`\`browser_*\` tools (under \`mcp__gh-router-peers__browser_*\`) drive a real Chrome / Edge browser via a local extension. Lead surface: \`browser_act(intent, value?)\` for any click / fill / type / scroll-to (an inner fast model resolves intent), \`browser_observe(intent?)\` for a 2-4 sentence natural-language page description, \`browser_extract(schema, instruction)\` for typed extraction, \`browser_navigate\` / \`browser_open_tab\` / \`browser_screenshot\` for state and visuals. The lead model never sees raw DOM: refs, bboxes, and role/name dumps stay internal.${powerNote}`);
12395
+ }
11335
12396
  return [
11336
12397
  "## Peer review and advisor",
11337
12398
  "",
@@ -12794,7 +13855,7 @@ function initProxyFromEnv() {
12794
13855
  //#endregion
12795
13856
  //#region package.json
12796
13857
  var name = "github-router";
12797
- var version$1 = "0.3.52";
13858
+ var version$1 = "0.3.66";
12798
13859
 
12799
13860
  //#endregion
12800
13861
  //#region src/lib/approval.ts
@@ -14516,6 +15577,11 @@ async function setupAndServe(options) {
14516
15577
  state.showToken = options.showToken;
14517
15578
  state.extendedBetas = options.extendedBetas;
14518
15579
  state.browseEnabled = options.browseEnabled || process.env.GH_ROUTER_ENABLE_BROWSE === "1";
15580
+ state.powerBrowseEnabled = options.powerBrowseEnabled || process.env.GH_ROUTER_ENABLE_POWER_BROWSE === "1";
15581
+ if (state.powerBrowseEnabled) state.browseEnabled = true;
15582
+ if (process.env.GH_ROUTER_BROWSER_NO_HUMANLIKE === "1") state.humanlikeForce = "off";
15583
+ else if (options.humanlikeEnabled || process.env.GH_ROUTER_HUMANLIKE === "1") state.humanlikeForce = "on";
15584
+ else state.humanlikeForce = "auto";
14519
15585
  if (process.env.COPILOT_API_URL) state.copilotApiUrl = process.env.COPILOT_API_URL;
14520
15586
  await ensurePaths();
14521
15587
  await cacheVSCodeVersion();
@@ -14623,6 +15689,16 @@ const sharedServerArgs = {
14623
15689
  type: "boolean",
14624
15690
  default: false,
14625
15691
  description: "Enable the browser-control MCP tools (browser_open_tab, browser_screenshot, browser_click, etc.) on /mcp. Requires Chrome or Edge installed; the bundled extension must be loaded on first tool call (the proxy returns install_required with Web Store URLs + a Load Unpacked fallback path). Off by default; can also be enabled with GH_ROUTER_ENABLE_BROWSE=1."
15692
+ },
15693
+ "power-browse": {
15694
+ type: "boolean",
15695
+ default: false,
15696
+ description: "Expose the full ~18-tool browser MCP surface (raw read_page, mouse / drag / scroll / keyboard / type primitives, eval_js, diagnostics, find, locate). Default --browse exposes only the 6 lead-model tools (act, observe, extract, navigate, screenshot, open_tab) that hide DOM details behind intent. Implies --browse. Off by default; can also be enabled with GH_ROUTER_ENABLE_POWER_BROWSE=1."
15697
+ },
15698
+ humanlike: {
15699
+ type: "boolean",
15700
+ default: false,
15701
+ description: "Force humanlike pacing on ALL browser tool dispatches: Beta-distributed inter-action delays (800-4600 ms), Bezier mouse trajectories with overshoot-and-correct, per-keystroke jitter with word-end pauses, scroll chunking. Use for known anti-bot sites (Cloudflare, Datadome). Off by default (auto mode); GH_ROUTER_HUMANLIKE=1 is the env equivalent. GH_ROUTER_BROWSER_NO_HUMANLIKE=1 hard-disables (wins over --humanlike, for tests)."
14626
15702
  }
14627
15703
  };
14628
15704
  const allowedAccountTypes = new Set([
@@ -14660,7 +15736,9 @@ function parseSharedArgs(args) {
14660
15736
  showToken: args["show-token"],
14661
15737
  proxyEnv: args["proxy-env"],
14662
15738
  extendedBetas: args["extended-betas"],
14663
- browseEnabled: args.browse
15739
+ browseEnabled: args.browse,
15740
+ powerBrowseEnabled: args["power-browse"],
15741
+ humanlikeEnabled: args.humanlike
14664
15742
  };
14665
15743
  }
14666
15744
  /**
@@ -14900,7 +15978,8 @@ const claude = defineCommand({
14900
15978
  geminiAvailable: geminiAvailable$1,
14901
15979
  workerToolsAvailable: workerToolsEnabled(),
14902
15980
  standInAvailable: standInToolEnabled(),
14903
- browseAvailable: state.browseEnabled
15981
+ browseAvailable: state.browseEnabled,
15982
+ powerBrowseAvailable: state.powerBrowseEnabled
14904
15983
  });
14905
15984
  extraArgs.push("--append-system-prompt", peerSnippet);
14906
15985
  try {