github-router 0.3.52 → 0.3.66
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser-bridge/index.js +17 -1
- package/dist/browser-ext/background.js +358 -1
- package/dist/browser-ext/manifest.json +2 -1
- package/dist/browser-ext/snapshot-cdp.js +438 -0
- package/dist/browser-ext/snapshot.js +101 -0
- package/dist/main.js +1148 -69
- package/dist/main.js.map +1 -1
- package/package.json +18 -18
package/dist/main.js
CHANGED
|
@@ -45,6 +45,8 @@ const state = {
|
|
|
45
45
|
showToken: false,
|
|
46
46
|
extendedBetas: false,
|
|
47
47
|
browseEnabled: false,
|
|
48
|
+
powerBrowseEnabled: false,
|
|
49
|
+
humanlikeForce: "auto",
|
|
48
50
|
sessionId: randomUUID(),
|
|
49
51
|
machineId: randomBytes(32).toString("hex")
|
|
50
52
|
};
|
|
@@ -3076,6 +3078,58 @@ function installRequiredToolResult(payload) {
|
|
|
3076
3078
|
};
|
|
3077
3079
|
}
|
|
3078
3080
|
|
|
3081
|
+
//#endregion
|
|
3082
|
+
//#region src/lib/browser-mcp/humanlike.ts
|
|
3083
|
+
/**
 * Draw a delay (whole milliseconds) positioned inside [minMs, maxMs]
 * according to a Beta(2, 5) distribution.
 *
 * Beta(2, 5) has its mode at 0.2 of the span with a long right tail:
 * most draws land near the low end and a few are much longer. A
 * uniform draw is deliberately avoided because evenly spread gaps are
 * easier to flag as automated.
 *
 * The Beta variate is composed from two Gamma variates:
 * X ~ Gamma(2), Y ~ Gamma(5)  =>  X / (X + Y) ~ Beta(2, 5).
 *
 * @param {number} minMs - Lower bound of the delay range.
 * @param {number} maxMs - Upper bound of the delay range.
 * @returns {number} Delay rounded to the nearest millisecond.
 */
function betaDelay(minMs, maxMs) {
	// Draw order matters for reproducibility: Gamma(2) first, then Gamma(5).
	const gammaAlpha = gammaSample(2);
	const gammaBeta = gammaSample(5);
	const fraction = gammaAlpha / (gammaAlpha + gammaBeta);
	const span = maxMs - minMs;
	return Math.round(minMs + fraction * span);
}
|
|
3098
|
+
/**
 * Draw one sample from a Gamma(shape, scale = 1) distribution using the
 * Marsaglia–Tsang squeeze method.
 *
 * The core method is only valid for shape >= 1. For 0 < shape < 1 the
 * standard boost is applied: Gamma(shape) = Gamma(shape + 1) * U^(1/shape).
 * Without these guards a shape below 1/3 makes `Math.sqrt(9 * d)` NaN,
 * every acceptance test then compares against NaN, and the rejection
 * loop never terminates.
 *
 * @param {number} shape - Shape parameter; must be finite and > 0.
 * @returns {number} A positive Gamma-distributed sample.
 * @throws {RangeError} When `shape` is not a finite number greater than 0.
 */
function gammaSample(shape) {
	if (!Number.isFinite(shape) || shape <= 0) {
		throw new RangeError(`gammaSample: shape must be a finite number > 0, got ${shape}`);
	}
	if (shape < 1) {
		// Boost small shapes into the method's valid range.
		let boost = 0;
		while (boost === 0) boost = Math.random();
		return gammaSample(shape + 1) * Math.pow(boost, 1 / shape);
	}
	const d = shape - 1 / 3;
	const c = 1 / Math.sqrt(9 * d);
	while (true) {
		let x, v;
		// Redraw until the cubed term below is guaranteed positive.
		do {
			x = normalSample();
			v = 1 + c * x;
		} while (v <= 0);
		v = v * v * v;
		const u = Math.random();
		// Cheap squeeze test first; it accepts the vast majority of draws.
		if (u < 1 - .0331 * x * x * x * x) return d * v;
		// Exact acceptance test for the remaining draws.
		if (Math.log(u) < .5 * x * x + d * (1 - v + Math.log(v))) return d * v;
	}
}
|
|
3113
|
+
/**
 * Draw one standard-normal sample via the Box–Muller transform.
 *
 * @returns {number} A finite sample from N(0, 1).
 */
function normalSample() {
	// Math.random() may return exactly 0; redraw so that Math.log(0)
	// (-Infinity) can never leak into the result.
	const drawNonZero = () => {
		let r = 0;
		while (r === 0) r = Math.random();
		return r;
	};
	const radiusSeed = drawNonZero();
	const angleSeed = drawNonZero();
	const radius = Math.sqrt(-2 * Math.log(radiusSeed));
	return radius * Math.cos(2 * Math.PI * angleSeed);
}
|
|
3119
|
+
/**
 * Inter-action delay used when paced ("humanlike") mode is on.
 *
 * Returns a Beta(2, 5)-shaped randomized delay. With the defaults the
 * result lies in [800, 4600] ms, so 800 ms is the floor and no paced
 * action is ever dispatched "too fast"; most draws land near the low
 * end of the range with a tail of longer pauses.
 *
 * The caller is expected to subtract time already spent since the
 * previous action (snapshot fetch, matcher cascade, ...) so the
 * user-perceived delay is not doubled.
 *
 * @param {number} [minMs=800] - Smallest delay that can be returned.
 * @param {number} [maxMs=4600] - Largest delay that can be returned.
 * @returns {number} Target gap between two paced actions, in milliseconds.
 */
function interActionDelay(minMs = 800, maxMs = 4600) {
	return betaDelay(minMs, maxMs);
}
|
|
3132
|
+
|
|
3079
3133
|
//#endregion
|
|
3080
3134
|
//#region src/lib/browser-mcp/policy.ts
|
|
3081
3135
|
const BLOCKED_URL_RE = /^(chrome|edge|brave|opera|vivaldi):\/\/(settings|preferences|extensions|policy|management|password|flags|flag-descriptions)/i;
|
|
@@ -3110,6 +3164,78 @@ function preflightUrlPolicy(toolName, args) {
|
|
|
3110
3164
|
|
|
3111
3165
|
//#endregion
|
|
3112
3166
|
//#region src/lib/browser-mcp/dispatch.ts
|
|
3167
|
+
/**
 * Names of the tools whose dispatch is treated as a mutating user
 * action for pacing purposes. Any tool not listed here (the read-only
 * ones such as listing tabs, screenshots, page reads, diagnostics)
 * bypasses the inter-action delay.
 */
const PACED_TOOLS = new Set([
	"browser_click",
	"browser_fill",
	"browser_type",
	"browser_keyboard",
	"browser_scroll",
	"browser_mouse",
	"browser_drag"
]);

/** Epoch-ms timestamp of the most recent paced dispatch (0 = none yet). */
let lastDispatchAt = 0;

/**
 * Cached answer of the bridge `/health` probe: when it was fetched and
 * which tab ids the bridge reported as humanlike.
 */
let humanlikeAutoCache = { fetchedAt: 0, tabs: new Set() };

/** Minimum age (ms) of the cache before the bridge is probed again. */
const HUMANLIKE_PROBE_INTERVAL_MS = 5000;
|
|
3188
|
+
/**
 * Decide whether pacing should apply to `tabId` in "auto" mode.
 *
 * The set of humanlike tabs is read from the bridge's `/health`
 * endpoint and cached; the bridge is re-probed only once the cache is
 * older than HUMANLIKE_PROBE_INTERVAL_MS. Probe failures are swallowed
 * on purpose (best effort): the previously cached set is used instead.
 *
 * @param {number|undefined} tabId - Tab the tool call targets.
 * @param {AbortSignal} [signal] - Forwarded to the health fetch.
 * @returns {Promise<boolean>} True when the tab is listed as humanlike.
 */
async function isHumanlikeAutoOn(tabId, signal) {
	if (state.humanlikeForce === "off" || typeof tabId !== "number") return false;

	const probeTime = Date.now();
	const cacheIsStale = probeTime - humanlikeAutoCache.fetchedAt > HUMANLIKE_PROBE_INTERVAL_MS;
	if (cacheIsStale) {
		try {
			const bridge = await ensureBridgeReady();
			if (bridge.install_required) return false;
			const response = await fetch(`http://127.0.0.1:${bridge.port}/health`, {
				headers: { authorization: `Bearer ${bridge.token}` },
				signal
			});
			if (response.ok) {
				const payload = await response.json();
				const freshTabs = new Set();
				for (const entry of payload.humanlike_tabs ?? []) {
					if (typeof entry.tabId === "number") freshTabs.add(entry.tabId);
				}
				humanlikeAutoCache = { fetchedAt: probeTime, tabs: freshTabs };
			}
		} catch {
			// Best effort: keep answering from the last cached set.
		}
	}
	return humanlikeAutoCache.tabs.has(tabId);
}
|
|
3211
|
+
/**
 * Sleep before dispatching `tool` when pacing applies.
 *
 * Pacing applies only to tools in PACED_TOOLS, and only when
 * `state.humanlikeForce` is "on", or is "auto" and the target tab is
 * reported as humanlike. The time elapsed since the previous paced
 * dispatch is subtracted from the randomized target gap.
 *
 * @param {string} tool - Tool name being dispatched.
 * @param {AbortSignal} [signal] - Aborts the probe and the sleep.
 * @param {number} [tabId] - Target tab, used for the "auto" lookup.
 * @returns {Promise<void>}
 */
async function maybeInjectHumanlikeDelay(tool, signal, tabId) {
	if (!PACED_TOOLS.has(tool)) return;

	const mode = state.humanlikeForce;
	let pacingActive = false;
	if (mode === "on") pacingActive = true;
	else if (mode === "auto") pacingActive = await isHumanlikeAutoOn(tabId, signal);
	if (!pacingActive) return;

	const targetGapMs = interActionDelay();
	const elapsedMs = Date.now() - lastDispatchAt;
	const remainingMs = Math.max(0, targetGapMs - elapsedMs);
	if (remainingMs > 0) await sleepAbortable(remainingMs, signal);
	lastDispatchAt = Date.now();
}
|
|
3222
|
+
/**
 * Promise-based sleep that can be cut short through an AbortSignal.
 *
 * @param {number} ms - Time to sleep, in milliseconds.
 * @param {AbortSignal} [signal] - When aborted (before or during the
 *   sleep) the promise rejects with `Error("aborted")`.
 * @returns {Promise<void>} Resolves once `ms` have elapsed.
 */
function sleepAbortable(ms, signal) {
	return new Promise((resolve, reject) => {
		if (signal?.aborted) {
			reject(new Error("aborted"));
			return;
		}
		let timerId;
		const handleAbort = () => {
			clearTimeout(timerId);
			reject(new Error("aborted"));
		};
		timerId = setTimeout(() => {
			// Detach the listener so a later abort cannot touch a settled promise.
			if (signal) signal.removeEventListener("abort", handleAbort);
			resolve();
		}, ms);
		if (signal) signal.addEventListener("abort", handleAbort, { once: true });
	});
}
|
|
3113
3239
|
const PER_TOOL_TIMEOUTS = {
|
|
3114
3240
|
browser_list_tabs: {
|
|
3115
3241
|
defaultMs: 5e3,
|
|
@@ -3275,6 +3401,7 @@ async function dispatchBrowserTool(tool, args, signal, opts = {}) {
|
|
|
3275
3401
|
};
|
|
3276
3402
|
const ready = await ensureBridgeReady();
|
|
3277
3403
|
if (ready.install_required) return installRequiredToolResult(ready);
|
|
3404
|
+
await maybeInjectHumanlikeDelay(tool, signal, typeof args.tabId === "number" ? args.tabId : void 0);
|
|
3278
3405
|
const { defaultMs, maxMs } = pickTimeout(tool);
|
|
3279
3406
|
const callerTimeout = typeof opts.timeoutMs === "number" && opts.timeoutMs > 0 ? Math.min(opts.timeoutMs, maxMs) : defaultMs;
|
|
3280
3407
|
try {
|
|
@@ -3355,6 +3482,517 @@ function logAudit$1(record) {
|
|
|
3355
3482
|
})();
|
|
3356
3483
|
}
|
|
3357
3484
|
|
|
3485
|
+
//#endregion
|
|
3486
|
+
//#region src/lib/browser-mcp/matcher.ts
|
|
3487
|
+
/**
 * Resolve a parsed intent to a concrete element action by walking the
 * matcher layers in LAYERS order. Synchronous and free of I/O.
 *
 * A layer wins when, after tie-breaking, its best candidate reaches the
 * layer's score floor and leads the runner-up by at least 0.15 (or has
 * no runner-up). The first winning layer short-circuits the cascade.
 *
 * When no layer wins, the result has `source: "escalate"`, an empty
 * `ref`, `confidence: 0` and a `candidates` shortlist (at most 8,
 * deduplicated by ref, best score first) that the caller can hand to
 * a fallback matcher instead of the whole snapshot.
 *
 * @param {object} snapshot - Page snapshot exposing `elements`.
 * @param {object} parsed - Output of `parseIntent`.
 * @param {string} [value] - Explicit value; falls back to `parsed.valueFromIntent`.
 * @returns {object} Resolution record (`ref`, `action`, optional `value`,
 *   `confidence`, `source`, `reason`, and `candidates` on escalation).
 */
function deterministicResolve(snapshot, parsed, value) {
	const effectiveValue = value ?? parsed.valueFromIntent;
	const seenCandidates = [];

	for (const layer of LAYERS) {
		const hits = layer.run(snapshot, parsed, effectiveValue);
		if (hits.length === 0) continue;
		seenCandidates.push(...hits);

		const [best, second] = applyTieBreakers(hits, parsed);
		if (!best) continue;

		const clearsFloor = best.score >= layer.floor;
		const clearsMargin = !second || best.score - second.score >= .15;
		if (!clearsFloor || !clearsMargin) continue;

		const action = inferActionLocal(best.el.role, parsed, effectiveValue);
		// Keys are assigned in a fixed order so serialized output is stable.
		const resolved = { ref: best.el.ref, action };
		if (needsValue(action) && effectiveValue !== void 0) resolved.value = effectiveValue;
		resolved.confidence = best.score;
		resolved.source = layer.name;
		resolved.reason = best.reason;
		return resolved;
	}

	const shortlist = dedupeAndRank(seenCandidates).slice(0, 8);
	const escalation = { ref: "", action: parsed.verb ?? "click" };
	if (effectiveValue !== void 0) escalation.value = effectiveValue;
	escalation.confidence = 0;
	escalation.source = "escalate";
	escalation.reason = shortlist.length === 0 ? "no candidates from any cascade layer" : `${shortlist.length} ambiguous candidates`;
	escalation.candidates = shortlist.map((cand) => ({
		ref: cand.el.ref,
		score: cand.score,
		layer: cand.layer
	}));
	return escalation;
}
|
|
3534
|
+
/**
 * Filter out unusable candidates, re-weight the rest and rank them.
 *
 * Dropped: hidden elements, elements whose bbox width or height is
 * under 4, and — for click / fill / type / select — disabled elements.
 * Survivors get `score * weight(...)` and are sorted best-first.
 *
 * @param {Array<object>} cands - Candidates produced by one layer.
 * @param {object} parsed - Parsed intent (only `verb` is read).
 * @returns {Array<object>} New candidate objects, highest score first.
 */
function applyTieBreakers(cands, parsed) {
	const verb = parsed.verb ?? "click";
	const skipDisabled = ["click", "fill", "type", "select"].includes(verb);
	const survivors = [];
	for (const cand of cands) {
		const { el } = cand;
		if (el.hidden) continue;
		if (el.bbox && (el.bbox[2] < 4 || el.bbox[3] < 4)) continue;
		if (skipDisabled && el.disabled) continue;
		survivors.push({ ...cand, score: cand.score * weight(cand, verb) });
	}
	survivors.sort((a, b) => b.score - a.score);
	return survivors;
}
|
|
3547
|
+
/**
 * Multiplicative tie-break weight (never above 1) for a candidate.
 *
 * Penalties, applied in this order: bbox origin not at non-negative
 * coordinates (x0.92), element inside an iframe (x0.95), and — for
 * click only — a role-based factor (link/a 0.98, menuitem 0.96,
 * generic/div/span 0.9; button and every other role unchanged).
 *
 * @param {object} c - Candidate; only `c.el` is read.
 * @param {string} verb - Effective verb of the intent.
 * @returns {number} Weight in (0, 1].
 */
function weight(c, verb) {
	const { el } = c;
	let factor = 1;

	const box = el.bbox;
	// Negated form on purpose: NaN / missing coordinates also count as off-origin.
	if (box && !(box[0] >= 0 && box[1] >= 0)) factor *= .92;

	if (el.isInIframe) factor *= .95;

	if (verb === "click") {
		switch ((el.role || "").toLowerCase()) {
			case "link":
			case "a":
				factor *= .98;
				break;
			case "menuitem":
				factor *= .96;
				break;
			case "generic":
			case "div":
			case "span":
				factor *= .9;
				break;
			default:
				// "button" and all other roles keep their score.
				break;
		}
	}
	return Math.min(1, factor);
}
|
|
3563
|
+
/**
 * Collapse candidates that point at the same element ref, keeping the
 * highest-scoring entry per ref, and return them best-first.
 *
 * @param {Array<object>} cands - Candidates from any number of layers.
 * @returns {Array<object>} One candidate per ref, sorted by score descending.
 */
function dedupeAndRank(cands) {
	const bestPerRef = cands.reduce((acc, cand) => {
		const key = cand.el.ref;
		const prior = acc.get(key);
		// Strictly-greater only: on equal scores the earlier candidate wins.
		if (!prior || prior.score < cand.score) acc.set(key, cand);
		return acc;
	}, new Map());
	return Array.from(bestPerRef.values()).sort((a, b) => b.score - a.score);
}
|
|
3571
|
+
/**
 * Pick the action to perform on a matched element.
 *
 * Scroll intents win first (explicit verb, or the word "scroll" in the
 * raw target). Otherwise the element role decides: select-like roles
 * always yield "select"; text-entry roles honour an explicit type/fill
 * verb and otherwise fill when a value exists, else click. Any other
 * role falls back to the parsed verb, defaulting to "click".
 *
 * @param {string} [role] - Role of the matched element.
 * @param {object} parsed - Parsed intent (`verb`, `rawTarget`).
 * @param {string} [value] - Value to enter, if any.
 * @returns {string} Action name.
 */
function inferActionLocal(role, parsed, value) {
	const { verb } = parsed;
	if (verb === "scroll_into_view" || /\bscroll\b/.test(parsed.rawTarget.toLowerCase())) {
		return "scroll_into_view";
	}
	switch ((role || "").toLowerCase()) {
		case "select":
		case "combobox":
			return "select";
		case "textarea":
		case "input":
		case "textbox":
		case "searchbox":
		case "spinbutton":
			if (verb === "type" || verb === "fill") return verb;
			return value !== void 0 ? "fill" : "click";
		default:
			return verb ?? "click";
	}
}
|
|
3584
|
+
/**
 * Tell whether an action carries a value payload.
 *
 * @param {string} action - Action name.
 * @returns {boolean} True for "fill", "type" and "select".
 */
function needsValue(action) {
	return ["fill", "type", "select"].includes(action);
}
|
|
3587
|
+
/**
 * Trimmed accessible name of an element ("" when it has none).
 *
 * @param {object} el - Snapshot element.
 * @returns {string} `el.name` without surrounding whitespace.
 */
function nameOf(el) {
	const { name } = el;
	return name == null ? "" : name.trim();
}
|
|
3590
|
+
/**
 * Lower-cased, trimmed accessible name of an element.
 *
 * @param {object} el - Snapshot element.
 * @returns {string} Result of `nameOf(el)` in lower case.
 */
function nameLowerOf(el) {
	const label = nameOf(el);
	return label.toLowerCase();
}
|
|
3593
|
+
/**
 * Tell whether a role denotes something a user would click.
 *
 * A missing role (undefined / null / "") is treated as "not clickable"
 * instead of throwing, matching the `role || ""` guard used by the
 * other role checks in this module.
 *
 * @param {string} [role] - Element role, any letter case.
 * @returns {boolean} True for button, link, a, menuitem, tab, checkbox,
 *   radio, switch, option and treeitem.
 */
function isClickableRole(role) {
	switch ((role || "").toLowerCase()) {
		case "button":
		case "link":
		case "a":
		case "menuitem":
		case "tab":
		case "checkbox":
		case "radio":
		case "switch":
		case "option":
		case "treeitem":
			return true;
		default:
			return false;
	}
}
|
|
3597
|
+
/**
 * Tell whether a role denotes a form control that accepts input.
 *
 * A missing role (undefined / null / "") is treated as "not an input"
 * instead of throwing, matching the `role || ""` guard used by the
 * other role checks in this module.
 *
 * @param {string} [role] - Element role, any letter case.
 * @returns {boolean} True for textbox, input, textarea, searchbox,
 *   spinbutton, combobox, select, checkbox and radio.
 */
function isInputRole(role) {
	switch ((role || "").toLowerCase()) {
		case "textbox":
		case "input":
		case "textarea":
		case "searchbox":
		case "spinbutton":
		case "combobox":
		case "select":
		case "checkbox":
		case "radio":
			return true;
		default:
			return false;
	}
}
|
|
3601
|
+
/**
 * Check that an element role can plausibly receive the given verb.
 *
 * @param {string} role - Element role.
 * @param {string} [verb] - Parsed verb; a missing verb is treated like "click".
 * @returns {boolean} click / no verb: clickable or input roles;
 *   fill / type / select: input roles only; any other verb: always true.
 */
function verbCompatible(role, verb) {
	switch (verb) {
		case "fill":
		case "type":
		case "select":
			return isInputRole(role);
		case "click":
			return isClickableRole(role) || isInputRole(role);
		default:
			// Falsy verb behaves like "click"; any other verb is unrestricted.
			return verb ? true : isClickableRole(role) || isInputRole(role);
	}
}
|
|
3606
|
+
/**
 * Case-insensitive test for `needle` occurring in `haystack` as a
 * whole word, i.e. not glued to a word character on either side.
 *
 * Lookarounds are used instead of `\b`: for a needle that starts and
 * ends with a word character the two are equivalent, but `\b` demands
 * a word character next to a needle edge that is itself punctuation,
 * so needles such as "c++" or "(required)" could never match.
 *
 * @param {string} haystack - Text to search in.
 * @param {string} needle - Literal text to look for (regex-escaped here).
 * @returns {boolean} False when either argument is empty.
 */
function wholeWordContains(haystack, needle) {
	if (!haystack || !needle) return false;
	const escaped = needle.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
	return new RegExp(`(?<!\\w)${escaped}(?!\\w)`, "i").test(haystack);
}
|
|
3610
|
+
/**
 * Layer 0 — exact accessible-name match.
 *
 * Compares the lower-cased element name with the quoted name from the
 * intent (or the normalized target when nothing was quoted). Only
 * elements whose role is compatible with the verb are considered.
 * Every hit scores 1; the layer's floor is 0.95.
 */
const L0 = {
	name: "L0",
	floor: .95,
	run: (snapshot, parsed) => {
		const target = parsed.quotedName ?? parsed.normTarget;
		if (!target) return [];
		const isExactMatch = (el) => {
			if (!verbCompatible(el.role, parsed.verb)) return false;
			const label = nameLowerOf(el);
			return Boolean(label) && label === target.toLowerCase();
		};
		return snapshot.elements.filter(isExactMatch).map((el) => ({
			el,
			score: 1,
			layer: "L0",
			reason: `L0 exact name "${el.name}"`
		}));
	}
};
|
|
3631
|
+
/**
 * Layer 1 — form-control label match.
 *
 * Runs only when the intent has no verb or a fill / type / select verb.
 * An input-role element matches when its lower-cased name equals the
 * target, equals the target followed by " *" or " (required)", or ends
 * with the target and everything before it is pure label decoration
 * (whitespace, `* ( ) : _ -`, or the word "required").
 *
 * The decoration test is anchored at both ends. A bare character class
 * anchored only at the start would accept any prefix that merely
 * begins with one of the letters of "required", so that e.g.
 * "delivery email" would count as a label match for "email".
 *
 * Every hit scores 0.95; the layer's floor is 0.9.
 */
const L1 = {
	name: "L1",
	floor: .9,
	run: (snapshot, parsed) => {
		if (parsed.verb && parsed.verb !== "fill" && parsed.verb !== "type" && parsed.verb !== "select") return [];
		const target = parsed.fieldHint ?? parsed.normTarget;
		if (!target) return [];
		const tgt = target.toLowerCase();
		const decorationOnly = /^(?:required|[\s*():_-])+$/;
		const out = [];
		for (const el of snapshot.elements) {
			if (!isInputRole(el.role)) continue;
			const nm = nameLowerOf(el);
			const isLabelMatch = nm === tgt
				|| nm === `${tgt} *`
				|| nm === `${tgt} (required)`
				|| nm.endsWith(tgt) && decorationOnly.test(nm.slice(0, nm.length - tgt.length));
			if (isLabelMatch) out.push({
				el,
				score: .95,
				layer: "L1",
				reason: `L1 label "${el.name}"`
			});
		}
		return out;
	}
};
|
|
3653
|
+
/**
 * Layer 2 — placeholder match on input-role elements.
 *
 * The target is the field hint, or the normalized target when there is
 * no hint. An exact (lower-cased) placeholder match scores 0.85; a
 * whole-word containment scores 0.75. The layer's floor is 0.7.
 */
const L2 = {
	name: "L2",
	floor: .7,
	run: (snapshot, parsed) => {
		const target = parsed.fieldHint ?? parsed.normTarget;
		if (!target) return [];
		const tgt = target.toLowerCase();
		return snapshot.elements.flatMap((el) => {
			if (!isInputRole(el.role)) return [];
			const placeholder = (el.placeholder ?? "").toLowerCase();
			if (!placeholder) return [];
			if (placeholder === tgt) {
				return [{
					el,
					score: .85,
					layer: "L2",
					reason: `L2 placeholder exact "${el.placeholder}"`
				}];
			}
			if (wholeWordContains(placeholder, tgt)) {
				return [{
					el,
					score: .75,
					layer: "L2",
					reason: `L2 placeholder contains "${tgt}"`
				}];
			}
			return [];
		});
	}
};
|
|
3681
|
+
/**
 * Layer 3 — fuzzy name match.
 *
 * A verb-compatible element matches when its trimmed name contains the
 * normalized target as a whole word. The score is 0.72 when the target
 * covers at least 80% of the name's length, otherwise 0.68. The
 * layer's floor is 0.65.
 */
const L3 = {
	name: "L3",
	floor: .65,
	run: (snapshot, parsed) => {
		const target = parsed.normTarget;
		if (!target) return [];
		return snapshot.elements.reduce((hits, el) => {
			if (!verbCompatible(el.role, parsed.verb)) return hits;
			const label = nameOf(el);
			if (!label || !wholeWordContains(label, target)) return hits;
			const coverage = target.length / label.length;
			hits.push({
				el,
				score: coverage >= .8 ? .72 : .68,
				layer: "L3",
				reason: `L3 fuzzy name "${label}"`
			});
			return hits;
		}, []);
	}
};
|
|
3704
|
+
/**
 * Layer 4 — match on the `value` text of clickable-role elements.
 *
 * The element's value is lower-cased and trimmed, then compared with
 * the lower-cased normalized target: an exact match scores 0.65, a
 * whole-word containment 0.6. The layer's floor is 0.6.
 */
const L4 = {
	name: "L4",
	floor: .6,
	run: (snapshot, parsed) => {
		const target = parsed.normTarget;
		if (!target) return [];
		return snapshot.elements.flatMap((el) => {
			if (!isClickableRole(el.role)) return [];
			const text = (el.value ?? "").toLowerCase().trim();
			if (!text) return [];
			const tgt = target.toLowerCase();
			if (text === tgt) {
				return [{
					el,
					score: .65,
					layer: "L4",
					reason: `L4 text exact "${el.value}"`
				}];
			}
			if (wholeWordContains(text, tgt)) {
				return [{
					el,
					score: .6,
					layer: "L4",
					reason: `L4 text contains "${tgt}"`
				}];
			}
			return [];
		});
	}
};
|
|
3732
|
+
/**
 * Layer 5 — identifier-style attribute match.
 *
 * Runs only when the normalized target looks like an identifier (a
 * letter followed by at least two letters, digits, `_` or `-`). The
 * target and each attribute are compared with separators stripped and
 * case folded. Attributes are probed in priority order and the first
 * that matches decides the score: testid 0.9, id 0.88, name_attr 0.86,
 * aria_label 0.86. The layer's floor is 0.85.
 */
const L5 = {
	name: "L5",
	floor: .85,
	run: (snapshot, parsed) => {
		const target = parsed.normTarget;
		if (!target) return [];
		if (!/^[a-z][a-z0-9_-]{2,}$/i.test(target)) return [];
		const norm = target.toLowerCase().replace(/[-_]/g, "");
		// [attribute key, score, label shown in the reason string]
		const probes = [
			["testid", .9, "testid"],
			["id", .88, "id"],
			["name_attr", .86, "name"],
			["aria_label", .86, "aria-label"]
		];
		const out = [];
		for (const el of snapshot.elements) {
			const attrs = el.attrs;
			if (!attrs) continue;
			const hit = probes.find(([key]) => attrs[key] && stripSep(attrs[key]).toLowerCase() === norm);
			if (!hit) continue;
			const [key, score, label] = hit;
			out.push({
				el,
				score,
				layer: "L5",
				reason: `L5 ${label}="${attrs[key]}"`
			});
		}
		return out;
	}
};
|
|
3781
|
+
/**
 * Remove every hyphen, underscore and whitespace character from a string.
 *
 * @param {string} s - Input text.
 * @returns {string} `s` without separators.
 */
function stripSep(s) {
	const separators = /[-_\s]/g;
	return s.replaceAll(separators, "");
}
|
|
3784
|
+
/**
 * Matcher layers in the order the resolver tries them. L0–L5 are
 * defined separately; L6 (ordinal pick) and L7 (well-known field
 * heuristics) are defined inline here.
 */
const LAYERS = [
	L0,
	L1,
	L2,
	L3,
	L4,
	L5,
	{
		/**
		 * L6 — ordinal pick ("third link", "last button").
		 *
		 * Elements are ordered by reading position: 24-unit-tall rows
		 * top to bottom, then left to right inside a row. Elements
		 * without a bbox are excluded because they cannot be placed in
		 * that order (and would otherwise crash the comparator).
		 */
		name: "L6",
		floor: .75,
		run: (snapshot, parsed) => {
			if (!parsed.ordinal) return [];
			const { n, kind } = parsed.ordinal;
			const candidates = snapshot.elements.filter((el) => {
				if (!el.bbox) return false;
				if (!kind) return true;
				const role = (el.role || "").toLowerCase();
				return role === kind || role === `${kind}s` || (el.tag ?? "").toLowerCase() === kind;
			});
			if (candidates.length < Math.abs(n)) return [];
			// `filter` already returned a fresh array, so sorting in place is safe.
			const sorted = candidates.sort((a, b) => {
				const rowA = Math.floor(a.bbox[1] / 24);
				const rowB = Math.floor(b.bbox[1] / 24);
				if (rowA !== rowB) return rowA - rowB;
				return a.bbox[0] - b.bbox[0];
			});
			// n === -1 encodes "last"; otherwise n is 1-based.
			const idx = n === -1 ? sorted.length - 1 : n - 1;
			if (idx < 0 || idx >= sorted.length) return [];
			return [{
				el: sorted[idx],
				score: .8,
				layer: "L6",
				reason: `L6 ordinal pick #${n} of ${sorted.length} ${kind ?? "elements"}`
			}];
		}
	},
	{
		/**
		 * L7 — heuristics for a handful of well-known field kinds
		 * (email, password, search, phone, submit, username). Every
		 * hit scores 0.55; the layer's floor is 0.5.
		 */
		name: "L7",
		floor: .5,
		run: (snapshot, parsed) => {
			const hint = parsed.fieldHint ?? parsed.normTarget;
			if (!hint) return [];
			const h = hint.toLowerCase();
			const out = [];
			// Tolerates elements that carry no role.
			const inputRolePred = (el) => isInputRole(el.role || "");
			const nameHas = (el, word) => wholeWordContains(el.name ?? "", word);
			// Push every element accepted by `predicate` with a uniform score.
			const collect = (label, predicate) => {
				for (const el of snapshot.elements) {
					if (predicate(el)) out.push({
						el,
						score: .55,
						layer: "L7",
						reason: `L7 ${label} heuristic`
					});
				}
			};
			if (h === "email") {
				collect("email", (el) => el.inputType === "email" || inputRolePred(el) && (wholeWordContains(el.placeholder ?? "", "email") || nameHas(el, "email")));
			} else if (h === "password") {
				collect("password", (el) => el.inputType === "password" || inputRolePred(el) && nameHas(el, "password"));
			} else if (h === "search") {
				collect("search", (el) => el.role === "searchbox" || el.inputType === "search" || inputRolePred(el) && nameHas(el, "search"));
			} else if (h === "phone" || h === "tel") {
				collect("phone", (el) => el.inputType === "tel" || inputRolePred(el) && nameHas(el, "phone"));
			} else if (h === "submit" || h === "sign in" || h === "signin" || h === "log in" || h === "login") {
				const sumRe = /^(submit|send|continue|next|save|sign[\s-]?in|sign[\s-]?up|log[\s-]?in)$/i;
				collect("submit", (el) => el.role === "button" && sumRe.test(el.name ?? ""));
			} else if (h === "username" || h === "user") {
				collect("username", (el) => inputRolePred(el) && (nameHas(el, "user") || nameHas(el, "login") || nameHas(el, "account")));
			}
			return out;
		}
	}
];
|
|
3876
|
+
|
|
3877
|
+
//#endregion
|
|
3878
|
+
//#region src/lib/browser-mcp/parse-intent.ts
|
|
3879
|
+
/** Leading verb of an intent ("click ...", "scroll into view ..."). */
const VERB_RE = /^\s*(click|press|tap|fill|enter|type|select|choose|scroll(?:[ -]?into[ -]?view)?|toggle|check|uncheck|open|focus|hover)\s+/i;

/** Trailing value clause: "... with X", "... to X", "... = X". */
const VALUE_RE = /\s+(?:with|to|=)\s+(.+?)\s*$/i;

/** First span wrapped in double quotes, single quotes or backticks. */
const QUOTED_RE = /["'`]([^"'`]+)["'`]/;

/** First run of one to four capitalized words. */
const TITLE_CASE_RE = /\b([A-Z][\w]*(?:\s+[A-Z\d][\w]*){0,3})\b/;

/** Ordinal word -> 1-based position; "last" is encoded as -1. */
const ORDINAL_WORDS = Object.fromEntries([
	...[
		"first",
		"second",
		"third",
		"fourth",
		"fifth",
		"sixth",
		"seventh",
		"eighth",
		"ninth",
		"tenth"
	].map((word, index) => [word, index + 1]),
	["last", -1]
]);

/** Ordinal word followed by the kind of element it counts. */
const ORDINAL_WORD_RE = /\b(first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|last)\s+(\w+)/i;

/** Numeric ordinal ("3rd", "2") followed by the kind of element it counts. */
const ORDINAL_NUM_RE = /\b(\d+)(?:st|nd|rd|th)?\s+(\w+)/i;

/** Nouns that name a kind of control rather than a specific control. */
const FIELD_HINT_KINDS = [
	"field", "input", "textbox", "box", "search", "dropdown", "select",
	"menu", "button", "link", "tab", "checkbox", "radio", "switch"
];

/** "<word> <kind>" — captures the word that qualifies a control kind. */
const FIELD_HINT_RE = new RegExp(`\\b(\\w+)\\s+(?:${FIELD_HINT_KINDS.join("|")})\\b`, "i");

/** Articles and demonstratives removed during target normalization. */
const ARTICLES_RE = /\b(the|a|an|this|that)\b/gi;
|
|
3916
|
+
/**
 * Break a natural-language intent into structured parts.
 *
 * Always returns an object with `rawTarget` (what is left after the
 * verb and value clauses are removed) and `normTarget` (lower-cased,
 * articles removed, whitespace collapsed, one trailing control-kind
 * noun and — when an ordinal was found — the leading ordinal
 * stripped). `verb`, `quotedName`, `fieldHint`, `ordinal` and
 * `valueFromIntent` are present only when they could be extracted, so
 * an unparseable intent just yields fewer signals for the matcher.
 *
 * @param {unknown} intent - Free-form instruction; null/undefined is treated as "".
 * @returns {object} Parsed intent.
 */
function parseIntent(intent) {
	let remaining = String(intent ?? "").trim();

	// Leading verb, e.g. "click", "fill".
	let verb;
	const verbHit = VERB_RE.exec(remaining);
	if (verbHit) {
		verb = mapVerb(verbHit[1]);
		remaining = remaining.slice(verbHit[0].length);
	}

	// Trailing value clause, e.g. "... with foo".
	let valueFromIntent;
	const valueHit = VALUE_RE.exec(remaining);
	if (valueHit) {
		valueFromIntent = valueHit[1].trim();
		remaining = remaining.slice(0, valueHit.index).trim();
	}

	// Explicit name: a quoted span wins over a Title Case run.
	const nameHit = QUOTED_RE.exec(remaining) ?? TITLE_CASE_RE.exec(remaining);
	const quotedName = nameHit ? nameHit[1].trim() : void 0;

	// Ordinal: the word form is tried first; digits only when no word matched.
	let ordinal;
	const wordHit = ORDINAL_WORD_RE.exec(remaining);
	if (wordHit) {
		const n = ORDINAL_WORDS[wordHit[1].toLowerCase()];
		if (typeof n === "number") ordinal = { n, kind: wordHit[2].toLowerCase() };
	} else {
		const numHit = ORDINAL_NUM_RE.exec(remaining);
		if (numHit) ordinal = { n: Number.parseInt(numHit[1], 10), kind: numHit[2].toLowerCase() };
	}

	const fieldHit = FIELD_HINT_RE.exec(remaining);
	const fieldHint = fieldHit ? fieldHit[1].toLowerCase() : void 0;

	const rawTarget = remaining.trim();
	let normTarget = rawTarget.toLowerCase().replace(ARTICLES_RE, "").replace(/\s+/g, " ").trim();

	// Drop at most one trailing control-kind noun ("email field" -> "email").
	const trailingKind = FIELD_HINT_KINDS
		.map((kind) => new RegExp(`\\s+${kind}$`, "i"))
		.find((tail) => tail.test(normTarget));
	if (trailingKind) normTarget = normTarget.replace(trailingKind, "").trim();

	if (ordinal) normTarget = normTarget.replace(/^(\d+(?:st|nd|rd|th)?|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|last)\s+/i, "").trim();

	// Optional keys are added in a fixed order and only when populated.
	return {
		rawTarget,
		normTarget,
		...(verb ? { verb } : {}),
		...(quotedName ? { quotedName } : {}),
		...(fieldHint ? { fieldHint } : {}),
		...(ordinal ? { ordinal } : {}),
		...(valueFromIntent !== void 0 ? { valueFromIntent } : {})
	};
}
|
|
3986
|
+
/**
 * Map the verb word captured from an intent onto a canonical action.
 *
 * Every spelling of "scroll into view" that the intent verb pattern can
 * capture is recognised — the two separators may independently be a
 * space, a hyphen or absent (e.g. "scroll-into view") — instead of
 * only the three uniform spellings.
 *
 * @param {string} raw - Verb as written by the user, any letter case.
 * @returns {string|undefined} "click", "fill", "type", "select" or
 *   "scroll_into_view"; undefined for verbs with no canonical action
 *   (hover, focus) and for unknown words.
 */
function mapVerb(raw) {
	const v = raw.toLowerCase();
	switch (v) {
		case "click":
		case "press":
		case "tap":
		case "toggle":
		case "check":
		case "uncheck":
		case "open":
			return "click";
		case "fill":
		case "enter":
			return "fill";
		case "type":
			return "type";
		case "select":
		case "choose":
			return "select";
		default:
			break;
	}
	if (/^scroll(?:[ -]?into[ -]?view)?$/.test(v)) return "scroll_into_view";
	// hover / focus and anything unrecognised have no canonical action.
	return void 0;
}
|
|
3995
|
+
|
|
3358
3996
|
//#endregion
|
|
3359
3997
|
//#region src/lib/mcp-inflight.ts
|
|
3360
3998
|
/**
|
|
@@ -3659,6 +4297,21 @@ async function callCompressor(systemPrompt, userMessage, tool, signal) {
|
|
|
3659
4297
|
}
|
|
3660
4298
|
}
|
|
3661
4299
|
/**
 * Public entry point to `callCompressor` for sibling modules that need
 * the same forced-tool-calling pipeline (slot acquisition, fallback-
 * chain backend, code-fence stripping) — `observe.ts` for the
 * natural-language describer and `decompose-planner.ts` for the
 * fast-model compound-step replanner.
 *
 * It is a wrapper rather than a direct re-export so that
 * `callCompressor` can change its signature without breaking this
 * public surface.
 *
 * @param {string} systemPrompt - Forwarded unchanged.
 * @param {string} userMessage - Forwarded unchanged.
 * @param {object} tool - Forwarded unchanged.
 * @param {AbortSignal} [signal] - Forwarded unchanged.
 * @returns {Promise<*>} Whatever `callCompressor` resolves to.
 */
async function callCompressorPublic(systemPrompt, userMessage, tool, signal) {
	const reply = await callCompressor(systemPrompt, userMessage, tool, signal);
	return reply;
}
|
|
4314
|
+
/**
|
|
3662
4315
|
* Strip a single leading / trailing ``` (or ```json) code fence from a
|
|
3663
4316
|
* model's free-form text reply so JSON.parse works. Idempotent on
|
|
3664
4317
|
* fence-free input. Defensive against the failure mode caught in PR #55
|
|
@@ -3680,12 +4333,32 @@ function stripCodeFence(text) {
|
|
|
3680
4333
|
* whether the intent supplied a value. Single source of truth for
|
|
3681
4334
|
* element matching.
|
|
3682
4335
|
*
|
|
4336
|
+
* Phase 2 short-circuits the common case through the deterministic
|
|
4337
|
+
* matcher cascade in `./matcher.ts` — pure-sync, no LLM round-trip,
|
|
4338
|
+
* <5ms on a 200-element snapshot. Only when the cascade returns
|
|
4339
|
+
* `source: "escalate"` (0 candidates or >1 ambiguous candidates) do
|
|
4340
|
+
* we fall through to the existing fast-model `pickMatchingElements`
|
|
4341
|
+
* path. When we DO escalate, we pass the cascade's pre-filtered
|
|
4342
|
+
* top-K shortlist along so the fast model sees ~8 candidates instead
|
|
4343
|
+
* of the full 200-element snapshot — 3-5× token-cost reduction even
|
|
4344
|
+
* on misses.
|
|
4345
|
+
*
|
|
3683
4346
|
* Returns ref="" + confidence=0 when no element matches — caller
|
|
3684
4347
|
* should escalate to visual fallback (when `visualSurfaces` is
|
|
3685
4348
|
* present) or surface the miss to the lead model.
|
|
3686
4349
|
*/
|
|
3687
4350
|
async function pickElement(snapshot, intent, signal, value) {
|
|
3688
|
-
const
|
|
4351
|
+
const det = deterministicResolve(snapshot, parseIntent(intent), value);
|
|
4352
|
+
if (det.source !== "escalate" && det.ref !== "") {
|
|
4353
|
+
const out$1 = {
|
|
4354
|
+
ref: det.ref,
|
|
4355
|
+
action: det.action,
|
|
4356
|
+
confidence: det.confidence
|
|
4357
|
+
};
|
|
4358
|
+
if (det.value !== void 0) out$1.value = det.value;
|
|
4359
|
+
return out$1;
|
|
4360
|
+
}
|
|
4361
|
+
const matches = await pickMatchingElements(snapshot, intent, signal, det.candidates);
|
|
3689
4362
|
if (matches.length === 0) return {
|
|
3690
4363
|
ref: "",
|
|
3691
4364
|
action: "click",
|
|
@@ -3756,9 +4429,28 @@ const FIND_ELEMENTS_TOOL = {
|
|
|
3756
4429
|
* Return up to 5 candidate matches for an intent. Used by
|
|
3757
4430
|
* `browser_find` — the lead model gets a small ranked list rather than
|
|
3758
4431
|
* a full element dump. Empty array when nothing matches.
|
|
3759
|
-
|
|
3760
|
-
|
|
3761
|
-
|
|
4432
|
+
*
|
|
4433
|
+
* Phase 2 short-circuits via the deterministic matcher cascade when
|
|
4434
|
+
* possible. When the cascade finds a single confident match, we
|
|
4435
|
+
* synthesize a one-item `FindMatch[]` and skip the fast-model
|
|
4436
|
+
* round-trip. When the cascade's `candidates` shortlist is passed in
|
|
4437
|
+
* by `pickElement` (escalation path), we trim the snapshot to just
|
|
4438
|
+
* those refs before sending to the fast model — keeps tokens down on
|
|
4439
|
+
* misses too.
|
|
4440
|
+
*/
|
|
4441
|
+
async function pickMatchingElements(snapshot, intent, signal, shortlist) {
|
|
4442
|
+
if (!shortlist) {
|
|
4443
|
+
const det = deterministicResolve(snapshot, parseIntent(intent));
|
|
4444
|
+
if (det.source !== "escalate" && det.ref !== "") {
|
|
4445
|
+
if (snapshot.elements.find((e) => e.ref === det.ref)) return [{
|
|
4446
|
+
ref: det.ref,
|
|
4447
|
+
reason: `deterministic ${det.source}: ${det.reason}`
|
|
4448
|
+
}];
|
|
4449
|
+
}
|
|
4450
|
+
shortlist = det.candidates;
|
|
4451
|
+
}
|
|
4452
|
+
const refSet = shortlist && shortlist.length > 0 ? new Set(shortlist.map((s) => s.ref)) : void 0;
|
|
4453
|
+
const trimmed = (refSet ? snapshot.elements.filter((e) => refSet.has(e.ref)) : snapshot.elements).map((e) => ({
|
|
3762
4454
|
ref: e.ref,
|
|
3763
4455
|
role: e.role,
|
|
3764
4456
|
name: e.name
|
|
@@ -3934,6 +4626,235 @@ async function pickElementVisual(screenshotB64, contentType, intent, visualSurfa
|
|
|
3934
4626
|
};
|
|
3935
4627
|
}
|
|
3936
4628
|
|
|
4629
|
+
//#endregion
|
|
4630
|
+
//#region src/lib/browser-mcp/decompose.ts
|
|
4631
|
+
/** `log in [to <site>] with <user> / <password>` — captures the user (1) and the password (2). */
const LOGIN_RE = /^log[ -]?in (?:to .+? )?with\s+([^\s/]+)\s*\/\s*(.+?)\s*$/i;
/** `search [for] <query> and click [the] first result` — captures the query (1). */
const SEARCH_CLICK_RE = /^search\s+(?:for\s+)?(.+?)\s+and\s+click\s+(?:the\s+)?first\s+result\s*$/i;
/**
 * Separator between the steps of a compound intent: `then`, `and then`,
 * `;`, `, and`, `, then`, `, and then`.
 *
 * Every alternative starts with a literal or a single `\s`, never with an
 * unbounded whitespace quantifier, so a long run of whitespace cannot
 * trigger polynomial backtracking. Whitespace left around a part is removed
 * by the `trim()` in `decompose`. `, and then` and `, then` are consumed as
 * a whole so no stray "then" or trailing comma ends up inside a step.
 */
const CONJUNCTION_SPLIT_RE = /,?\s(?:and\s+)?then\s+|;\s*|,\s+and\s+(?:then\s+)?/i;
/**
 * Decompose a natural-language intent into atomic steps.
 *
 * Templates are tried in order: `login`, `search_click`, `conjunction`.
 * When none applies the result is the single-step fallback
 * `[{intent: rawIntent}]`, so a caller that dispatches the steps one by
 * one performs exactly one dispatch for a plain intent.
 *
 * @param intent Free-form intent; coerced to a string and trimmed
 *   (null / undefined become "").
 * @param value Optional value. Attached to the single fallback step, or to
 *   the FIRST step of a conjunction. Ignored by the login / search
 *   templates, which take their values from the intent text itself.
 * @returns `{ steps, template, successSummary? }` where `template` is one
 *   of "login", "search_click", "conjunction" or "fallback".
 */
function decompose(intent, value) {
	const raw = String(intent ?? "").trim();
	const withValue = (stepIntent) => value !== void 0 ? {
		intent: stepIntent,
		value
	} : { intent: stepIntent };
	const fallback = () => ({
		steps: [withValue(raw)],
		template: "fallback"
	});
	if (!raw) return fallback();
	const loginMatch = LOGIN_RE.exec(raw);
	if (loginMatch) {
		const user = loginMatch[1].trim();
		const pass = loginMatch[2].trim();
		return {
			steps: [
				{
					intent: "the email or username input",
					value: user
				},
				{
					intent: "the password input",
					value: pass
				},
				{ intent: "the Sign in or Log in button" }
			],
			template: "login",
			successSummary: "logged in"
		};
	}
	const searchMatch = SEARCH_CLICK_RE.exec(raw);
	if (searchMatch) {
		const query = searchMatch[1].trim();
		return {
			steps: [
				{
					intent: "the search input",
					value: query
				},
				{ intent: "the search button or submit" },
				{ intent: "the first search result" }
			],
			template: "search_click",
			successSummary: `searched for "${query}" and opened first result`
		};
	}
	// One pass only: when no separator is present `split` yields a single
	// part and we fall through to the fallback below.
	const parts = raw.split(CONJUNCTION_SPLIT_RE).map((p) => p.trim()).filter(Boolean);
	if (parts.length >= 2) return {
		// The caller-supplied value belongs to the first step only.
		steps: parts.map((p, i) => i === 0 ? withValue(p) : { intent: p }),
		template: "conjunction"
	};
	return fallback();
}
|
|
4707
|
+
|
|
4708
|
+
//#endregion
|
|
4709
|
+
//#region src/lib/browser-mcp/observe.ts
|
|
4710
|
+
const OBSERVE_SYSTEM = `You describe a web page for an AI assistant that cannot see the DOM.

Write 2-4 sentences focused on user-actionable elements (forms, buttons, links) and the page's purpose. If 'intent' is provided, focus the description on the region most relevant to that intent.

DO NOT mention DOM refs, selectors, bbox coordinates, or any internal identifiers. Plain prose only. Treat the reader as someone who will issue commands like "click the Sign In button" — describe what's there in terms they can act on.

Call the describe_page tool with your description.`;
const OBSERVE_TOOL = {
	name: "describe_page",
	description: "Report the natural-language description of the page.",
	parameters: {
		type: "object",
		required: ["description"],
		additionalProperties: false,
		properties: { description: {
			type: "string",
			description: "2-4 sentence prose description of the visible page state."
		} }
	}
};
/**
 * Ask the compressor model for a short prose description of a page.
 *
 * The request carries the optional intent, the page url / title, the
 * first 4000 characters of the page text, up to 80 named elements
 * reduced to `{role, name}`, and a flag telling whether the snapshot
 * lists any visual surfaces. Refs are never included in the request.
 *
 * @param snapshot Page snapshot; `elements` is read unconditionally,
 *   `url`, `title`, `text` and `visualSurfaces` are optional.
 * @param intent Optional focus hint; sent as "" when null / undefined.
 * @param signal Passed through to the compressor call.
 * @returns `{ description, hasVisualSurfaces, url?, title? }`. The
 *   description is a fixed placeholder sentence when the model reply is
 *   not an object with a string `description`.
 */
async function observePage(snapshot, intent, signal) {
	const isNamed = (el) => el.name && el.name.length > 0;
	const toRoleAndName = (el) => ({
		role: el.role,
		name: el.name
	});
	const surfacesPresent = () => Boolean(snapshot.visualSurfaces && snapshot.visualSurfaces.length > 0);
	const namedElements = snapshot.elements.filter(isNamed).slice(0, 80).map(toRoleAndName);
	const request = JSON.stringify({
		intent: intent ?? "",
		url: snapshot.url ?? "",
		title: snapshot.title ?? "",
		visible_text: (snapshot.text ?? "").slice(0, 4e3),
		actionable_elements: namedElements,
		has_visual_surfaces: surfacesPresent()
	});
	const reply = await callCompressorPublic(OBSERVE_SYSTEM, request, OBSERVE_TOOL, signal);
	const replyIsUsable = reply && typeof reply === "object" && typeof reply.description === "string";
	const result = {
		description: replyIsUsable ? reply.description : "Page contents could not be described.",
		hasVisualSurfaces: surfacesPresent()
	};
	// url / title are only echoed back when the snapshot actually has them.
	if (snapshot.url) result.url = snapshot.url;
	if (snapshot.title) result.title = snapshot.title;
	return result;
}
|
|
4755
|
+
|
|
4756
|
+
//#endregion
|
|
4757
|
+
//#region src/lib/browser-mcp/planner.ts
|
|
4758
|
+
const PLANNER_SYSTEM = `You are a browser-automation replanner. A user issued a high-level intent that was decomposed into atomic steps. Several steps ran successfully, then one failed. You see the page state AFTER the failure and decide what to do next.

Your job: produce a revised list of atomic steps that will accomplish the original intent given the current page. If you cannot — the page has changed in a way that makes the intent impossible (login form vanished, navigation moved elsewhere, captcha appeared) — return an empty list and explain why in reasoning.

Each replanned step is a free-form natural-language intent ("the email input", "the Sign In button at the bottom of the form") plus an optional value for fill/type/select actions. Be SPECIFIC about element location ("at the bottom of the form", "in the top navigation") so the deterministic matcher cascade can resolve it without ambiguity. Do NOT reference element refs.

Cost rule: you get ONE call per compound failure. Make every step count.

Call the replan_compound tool with your answer.`;
const PLANNER_TOOL = {
	name: "replan_compound",
	description: "Report the revised atomic steps to complete the original compound intent.",
	parameters: {
		type: "object",
		required: ["steps", "reasoning"],
		additionalProperties: false,
		properties: {
			steps: {
				type: "array",
				maxItems: 8,
				items: {
					type: "object",
					required: ["intent"],
					additionalProperties: false,
					properties: {
						intent: { type: "string" },
						value: { type: "string" }
					}
				}
			},
			reasoning: {
				type: "string",
				description: "1-2 sentence explanation of the replanning decision."
			}
		}
	}
};
/**
 * Ask the compressor model for a revised step list after a compound
 * intent failed part-way through.
 *
 * The request contains the original intent / value, the steps that
 * already completed, the failed step with its failure reason, and a
 * reduced view of the current page: url, title, the first 3000
 * characters of text and the first 80 elements cut down to `role` plus
 * whichever of `name` / `placeholder` / `value` are truthy.
 *
 * @param input `{ originalIntent, originalValue, completedSteps,
 *   failedStep, failureReason, snapshot }`.
 * @param signal Passed through to the compressor call.
 * @returns `{ steps, reasoning }`. `steps` holds at most 8 entries, each
 *   with a non-empty string `intent` and a `value` only when the model
 *   supplied a string. `steps` is empty when the reply is not an object
 *   or has no `steps` array; `reasoning` is "" when the model gave none.
 */
async function planCompoundReplan(input, signal) {
	const page = input.snapshot;
	const describeElement = (el) => ({
		role: el.role,
		...el.name ? { name: el.name } : {},
		...el.placeholder ? { placeholder: el.placeholder } : {},
		...el.value ? { value: el.value } : {}
	});
	const describeStep = (step) => ({
		intent: step.intent,
		...step.value !== void 0 ? { value: step.value } : {}
	});
	const actionable = page.elements.slice(0, 80).map(describeElement);
	const request = JSON.stringify({
		original_intent: input.originalIntent,
		original_value: input.originalValue,
		completed_steps: input.completedSteps.map(describeStep),
		failed_step: describeStep(input.failedStep),
		failure_reason: input.failureReason,
		page_now: {
			url: page.url ?? "",
			title: page.title ?? "",
			visible_text: (page.text ?? "").slice(0, 3e3),
			actionable_elements: actionable
		}
	});
	const reply = await callCompressorPublic(PLANNER_SYSTEM, request, PLANNER_TOOL, signal);
	if (!reply || typeof reply !== "object") return {
		steps: [],
		reasoning: "planner returned empty response"
	};
	const reasoning = typeof reply.reasoning === "string" ? reply.reasoning : "";
	if (!Array.isArray(reply.steps)) return {
		steps: [],
		reasoning
	};
	// Keep only well-formed entries from the first 8 the model returned.
	const steps = reply.steps.slice(0, 8).flatMap((candidate) => {
		if (!candidate || typeof candidate !== "object") return [];
		const { intent, value } = candidate;
		if (typeof intent !== "string" || intent.length === 0) return [];
		return [typeof value === "string" ? {
			intent,
			value
		} : { intent }];
	});
	return {
		steps,
		reasoning
	};
}
|
|
4857
|
+
|
|
3937
4858
|
//#endregion
|
|
3938
4859
|
//#region src/lib/browser-mcp/index.ts
|
|
3939
4860
|
/**
|
|
@@ -3990,7 +4911,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
3990
4911
|
additionalProperties: false,
|
|
3991
4912
|
properties: {}
|
|
3992
4913
|
},
|
|
3993
|
-
capability: "
|
|
4914
|
+
capability: "browser_power",
|
|
3994
4915
|
async handler(args, signal) {
|
|
3995
4916
|
return dispatchBrowserTool("browser_list_tabs", args, signal);
|
|
3996
4917
|
}
|
|
@@ -4031,7 +4952,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4031
4952
|
description: "Array of tab ids to close (from browser_list_tabs)."
|
|
4032
4953
|
} }
|
|
4033
4954
|
},
|
|
4034
|
-
capability: "
|
|
4955
|
+
capability: "browser_power",
|
|
4035
4956
|
async handler(args, signal) {
|
|
4036
4957
|
return dispatchBrowserTool("browser_close_tab", args, signal);
|
|
4037
4958
|
}
|
|
@@ -4116,7 +5037,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4116
5037
|
}
|
|
4117
5038
|
}
|
|
4118
5039
|
},
|
|
4119
|
-
capability: "
|
|
5040
|
+
capability: "browser_power",
|
|
4120
5041
|
async handler(args, signal) {
|
|
4121
5042
|
return dispatchBrowserTool("browser_read_page", args, signal);
|
|
4122
5043
|
}
|
|
@@ -4175,7 +5096,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4175
5096
|
}
|
|
4176
5097
|
}
|
|
4177
5098
|
},
|
|
4178
|
-
capability: "
|
|
5099
|
+
capability: "browser_power",
|
|
4179
5100
|
async handler(args, signal) {
|
|
4180
5101
|
return dispatchBrowserTool("browser_scroll", args, signal);
|
|
4181
5102
|
}
|
|
@@ -4195,7 +5116,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4195
5116
|
}
|
|
4196
5117
|
}
|
|
4197
5118
|
},
|
|
4198
|
-
capability: "
|
|
5119
|
+
capability: "browser_power",
|
|
4199
5120
|
async handler(args, signal) {
|
|
4200
5121
|
return dispatchBrowserTool("browser_keyboard", args, signal);
|
|
4201
5122
|
}
|
|
@@ -4232,7 +5153,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4232
5153
|
}
|
|
4233
5154
|
}
|
|
4234
5155
|
},
|
|
4235
|
-
capability: "
|
|
5156
|
+
capability: "browser_power",
|
|
4236
5157
|
async handler(args, signal) {
|
|
4237
5158
|
return dispatchBrowserTool("browser_wait", args, signal);
|
|
4238
5159
|
}
|
|
@@ -4256,7 +5177,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4256
5177
|
}
|
|
4257
5178
|
}
|
|
4258
5179
|
},
|
|
4259
|
-
capability: "
|
|
5180
|
+
capability: "browser_power",
|
|
4260
5181
|
async handler(args, signal) {
|
|
4261
5182
|
return dispatchBrowserTool("browser_eval_js", args, signal);
|
|
4262
5183
|
}
|
|
@@ -4288,7 +5209,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4288
5209
|
}
|
|
4289
5210
|
}
|
|
4290
5211
|
},
|
|
4291
|
-
capability: "
|
|
5212
|
+
capability: "browser_power",
|
|
4292
5213
|
async handler(args, signal) {
|
|
4293
5214
|
return dispatchBrowserTool("browser_download", args, signal);
|
|
4294
5215
|
}
|
|
@@ -4352,7 +5273,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4352
5273
|
}
|
|
4353
5274
|
}
|
|
4354
5275
|
},
|
|
4355
|
-
capability: "
|
|
5276
|
+
capability: "browser_power",
|
|
4356
5277
|
async handler(args, signal) {
|
|
4357
5278
|
return dispatchBrowserTool("browser_mouse", args, signal);
|
|
4358
5279
|
}
|
|
@@ -4426,7 +5347,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4426
5347
|
}
|
|
4427
5348
|
}
|
|
4428
5349
|
},
|
|
4429
|
-
capability: "
|
|
5350
|
+
capability: "browser_power",
|
|
4430
5351
|
async handler(args, signal) {
|
|
4431
5352
|
return dispatchBrowserTool("browser_drag", args, signal);
|
|
4432
5353
|
}
|
|
@@ -4450,7 +5371,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4450
5371
|
}
|
|
4451
5372
|
}
|
|
4452
5373
|
},
|
|
4453
|
-
capability: "
|
|
5374
|
+
capability: "browser_power",
|
|
4454
5375
|
async handler(args, signal) {
|
|
4455
5376
|
return dispatchBrowserTool("browser_type", args, signal);
|
|
4456
5377
|
}
|
|
@@ -4491,7 +5412,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4491
5412
|
}
|
|
4492
5413
|
}
|
|
4493
5414
|
},
|
|
4494
|
-
capability: "
|
|
5415
|
+
capability: "browser_power",
|
|
4495
5416
|
async handler(args, signal) {
|
|
4496
5417
|
const kind = args.kind === "network" ? "network" : "console";
|
|
4497
5418
|
const tool = kind === "network" ? "browser_network_log" : "browser_console_logs";
|
|
@@ -4548,7 +5469,7 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4548
5469
|
}
|
|
4549
5470
|
}
|
|
4550
5471
|
},
|
|
4551
|
-
capability: "
|
|
5472
|
+
capability: "browser_power",
|
|
4552
5473
|
async handler(args, signal) {
|
|
4553
5474
|
const tabId = typeof args.tabId === "number" ? args.tabId : void 0;
|
|
4554
5475
|
const intent = typeof args.intent === "string" ? args.intent : "";
|
|
@@ -4615,65 +5536,109 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4615
5536
|
const value = typeof args.value === "string" ? args.value : void 0;
|
|
4616
5537
|
if (!refIn && !intent) return toolEnvelope({ error: "either `ref` (REF mode) or `intent` (INTENT mode) is required" }, true);
|
|
4617
5538
|
if (refIn) return dispatchActionByRef(tabId, refIn, typeof args.action === "string" ? args.action : "click", value, signal);
|
|
4618
|
-
const
|
|
4619
|
-
|
|
4620
|
-
|
|
4621
|
-
|
|
4622
|
-
|
|
4623
|
-
|
|
4624
|
-
|
|
4625
|
-
|
|
5539
|
+
const decomposed = decompose(intent, value);
|
|
5540
|
+
if (decomposed.steps.length === 1) return runAtomicIntentStep(tabId, decomposed.steps[0].intent, decomposed.steps[0].value, signal);
|
|
5541
|
+
const summaries = [];
|
|
5542
|
+
let navigated = false;
|
|
5543
|
+
const completedSteps = [];
|
|
5544
|
+
for (let i = 0; i < decomposed.steps.length; i++) {
|
|
5545
|
+
const step = decomposed.steps[i];
|
|
5546
|
+
const env = await runAtomicIntentStep(tabId, step.intent, step.value, signal);
|
|
5547
|
+
const stepText = env.content?.[0]?.text;
|
|
5548
|
+
let stepResult = {};
|
|
5549
|
+
if (typeof stepText === "string") try {
|
|
5550
|
+
stepResult = JSON.parse(stepText);
|
|
5551
|
+
} catch {}
|
|
5552
|
+
if (env.isError || stepResult.ok === false) try {
|
|
5553
|
+
const failureReason = String(stepResult.error ?? "unknown");
|
|
5554
|
+
const replan = await planCompoundReplan({
|
|
5555
|
+
originalIntent: intent,
|
|
5556
|
+
originalValue: value,
|
|
5557
|
+
completedSteps,
|
|
5558
|
+
failedStep: step,
|
|
5559
|
+
failureReason,
|
|
5560
|
+
snapshot: await fetchSnapshot(tabId, signal)
|
|
4626
5561
|
}, signal);
|
|
4627
|
-
if (
|
|
5562
|
+
if (replan.steps.length === 0) return toolEnvelope({
|
|
4628
5563
|
ok: false,
|
|
4629
|
-
|
|
4630
|
-
|
|
5564
|
+
summary: `compound step ${i + 1}/${decomposed.steps.length} failed and planner declined: ${replan.reasoning || failureReason}`,
|
|
5565
|
+
template: decomposed.template,
|
|
5566
|
+
steps_completed: i,
|
|
5567
|
+
failed_step: step.intent,
|
|
5568
|
+
planner_reasoning: replan.reasoning
|
|
4631
5569
|
}, true);
|
|
4632
|
-
const
|
|
4633
|
-
let
|
|
4634
|
-
|
|
4635
|
-
|
|
4636
|
-
|
|
4637
|
-
|
|
5570
|
+
const replanSummaries = [];
|
|
5571
|
+
for (let j = 0; j < replan.steps.length; j++) {
|
|
5572
|
+
const rstep = replan.steps[j];
|
|
5573
|
+
const renv = await runAtomicIntentStep(tabId, rstep.intent, rstep.value, signal);
|
|
5574
|
+
const rtext = renv.content?.[0]?.text;
|
|
5575
|
+
let rresult = {};
|
|
5576
|
+
if (typeof rtext === "string") try {
|
|
5577
|
+
rresult = JSON.parse(rtext);
|
|
5578
|
+
} catch {}
|
|
5579
|
+
if (renv.isError || rresult.ok === false) return toolEnvelope({
|
|
4638
5580
|
ok: false,
|
|
4639
|
-
|
|
5581
|
+
summary: `compound failed at original step ${i + 1}, planner replan also failed at step ${j + 1}/${replan.steps.length}: ${String(rresult.error ?? "unknown")}`,
|
|
5582
|
+
template: decomposed.template,
|
|
5583
|
+
steps_completed: i,
|
|
5584
|
+
failed_step: rstep.intent,
|
|
5585
|
+
planner_reasoning: replan.reasoning
|
|
4640
5586
|
}, true);
|
|
5587
|
+
if (typeof rresult.action_taken === "string") replanSummaries.push(`${rresult.action_taken} (${rstep.intent})`);
|
|
5588
|
+
if (rresult.navigated === true) navigated = true;
|
|
4641
5589
|
}
|
|
4642
|
-
if (!shot.contentType || !shot.dataBase64) return toolEnvelope({
|
|
4643
|
-
ok: false,
|
|
4644
|
-
error: "no text match; screenshot envelope missing fields"
|
|
4645
|
-
}, true);
|
|
4646
|
-
const visual = await pickElementVisual(shot.dataBase64, shot.contentType, intent, surfaces, signal);
|
|
4647
|
-
if (visual.confidence < .5) return toolEnvelope({
|
|
4648
|
-
ok: false,
|
|
4649
|
-
error: "no element matched intent (text + visual)",
|
|
4650
|
-
picked,
|
|
4651
|
-
visual
|
|
4652
|
-
}, true);
|
|
4653
|
-
const clickEnv = await dispatchBrowserTool("browser_mouse", {
|
|
4654
|
-
tabId,
|
|
4655
|
-
action: "click",
|
|
4656
|
-
x: visual.x,
|
|
4657
|
-
y: visual.y,
|
|
4658
|
-
force: true
|
|
4659
|
-
}, signal);
|
|
4660
|
-
if (clickEnv.isError) return clickEnv;
|
|
4661
5590
|
return toolEnvelope({
|
|
4662
5591
|
ok: true,
|
|
4663
|
-
|
|
4664
|
-
|
|
4665
|
-
|
|
4666
|
-
|
|
4667
|
-
|
|
5592
|
+
summary: `compound recovered via planner (${replan.reasoning}): ${replanSummaries.join(" → ")}`,
|
|
5593
|
+
template: decomposed.template,
|
|
5594
|
+
steps_completed: i + replan.steps.length,
|
|
5595
|
+
navigated,
|
|
5596
|
+
planner_used: true,
|
|
5597
|
+
planner_reasoning: replan.reasoning
|
|
4668
5598
|
});
|
|
5599
|
+
} catch (replanErr) {
|
|
5600
|
+
return toolEnvelope({
|
|
5601
|
+
ok: false,
|
|
5602
|
+
summary: `compound step ${i + 1}/${decomposed.steps.length} failed; planner errored: ${replanErr instanceof Error ? replanErr.message : String(replanErr)}`,
|
|
5603
|
+
template: decomposed.template,
|
|
5604
|
+
steps_completed: i,
|
|
5605
|
+
failed_step: step.intent
|
|
5606
|
+
}, true);
|
|
5607
|
+
}
|
|
5608
|
+
if (typeof stepResult.action_taken === "string") summaries.push(`${stepResult.action_taken} (${step.intent})`);
|
|
5609
|
+
if (stepResult.navigated === true) navigated = true;
|
|
5610
|
+
completedSteps.push(step);
|
|
5611
|
+
}
|
|
5612
|
+
return toolEnvelope({
|
|
5613
|
+
ok: true,
|
|
5614
|
+
summary: decomposed.successSummary ?? summaries.join(" → "),
|
|
5615
|
+
template: decomposed.template,
|
|
5616
|
+
steps_completed: decomposed.steps.length,
|
|
5617
|
+
navigated
|
|
5618
|
+
});
|
|
5619
|
+
}
|
|
5620
|
+
},
|
|
5621
|
+
{
|
|
5622
|
+
toolNameHttp: "browser_observe",
|
|
5623
|
+
description: "Get a natural-language description of the current page's user-actionable state — what forms, buttons, links, and content sections are visible — in 2-4 sentences. Optional `intent` focuses the description on a region ('describe the login form', 'what's in the comments section'). Use this BEFORE browser_act when you don't know what's on the page, or AFTER navigation to confirm the page loaded. Cheaper than screenshots when text is enough. Does not include canvas/SVG content — those surface as a `hasVisualSurfaces` flag; switch to browser_screenshot for visuals.",
|
|
5624
|
+
inputSchema: {
|
|
5625
|
+
type: "object",
|
|
5626
|
+
required: ["tabId"],
|
|
5627
|
+
additionalProperties: false,
|
|
5628
|
+
properties: {
|
|
5629
|
+
tabId: { type: "number" },
|
|
5630
|
+
intent: {
|
|
5631
|
+
type: "string",
|
|
5632
|
+
description: "Optional natural-language focus ('describe the form', 'what's in the sidebar')."
|
|
4669
5633
|
}
|
|
4670
|
-
return toolEnvelope({
|
|
4671
|
-
ok: false,
|
|
4672
|
-
error: "no element matched intent",
|
|
4673
|
-
picked
|
|
4674
|
-
}, true);
|
|
4675
5634
|
}
|
|
4676
|
-
|
|
5635
|
+
},
|
|
5636
|
+
capability: "browser_compound",
|
|
5637
|
+
async handler(args, signal) {
|
|
5638
|
+
const tabId = typeof args.tabId === "number" ? args.tabId : void 0;
|
|
5639
|
+
const intent = typeof args.intent === "string" ? args.intent : void 0;
|
|
5640
|
+
if (!tabId) return toolEnvelope({ error: "tabId required" }, true);
|
|
5641
|
+
return toolEnvelope(await observePage(await fetchSnapshot(tabId, signal), intent, signal));
|
|
4677
5642
|
}
|
|
4678
5643
|
},
|
|
4679
5644
|
{
|
|
@@ -4716,6 +5681,76 @@ const BROWSER_TOOLS = Object.freeze([
|
|
|
4716
5681
|
}
|
|
4717
5682
|
]);
|
|
4718
5683
|
/**
 * Resolve one atomic intent against the current page and perform it.
 *
 * Flow:
 *  1. Fetch a snapshot and let `pickElement` choose a ref.
 *  2. If a ref came back (and its confidence is not below 0.5), dispatch
 *     the picked action through `dispatchActionByRef`, preferring the
 *     picker's value over the caller's.
 *  3. Otherwise, when the snapshot lists visual surfaces, take a PNG
 *     screenshot, let `pickElementVisual` choose coordinates and issue a
 *     forced `browser_mouse` click there.
 *  4. Otherwise report "no element matched intent".
 *
 * Lives outside the `browser_act` handler so the compound-intent loop
 * can run it once per step.
 *
 * @returns An MCP envelope: the ref dispatch result, the failed click
 *   envelope, or a `toolEnvelope` describing the visual click / the miss.
 */
async function runAtomicIntentStep(tabId, intent, value, signal) {
	const snapshot = await fetchSnapshot(tabId, signal);
	const picked = await pickElement(snapshot, intent, signal, value);
	const textMissed = !picked.ref || picked.confidence < .5;
	if (!textMissed) return dispatchActionByRef(tabId, picked.ref, picked.action, picked.value ?? value, signal);
	const surfaces = snapshot.visualSurfaces;
	const visualPossible = surfaces && surfaces.length > 0;
	if (!visualPossible) return toolEnvelope({
		ok: false,
		error: "no element matched intent",
		picked
	}, true);
	const screenshotArgs = {
		tabId,
		format: "png"
	};
	const shotEnv = await dispatchBrowserTool("browser_screenshot", screenshotArgs, signal);
	if (shotEnv.isError) return toolEnvelope({
		ok: false,
		error: "no text match; screenshot for visual fallback failed",
		picked
	}, true);
	// The screenshot comes back as JSON text inside the first content item.
	const shotText = shotEnv.content?.[0]?.text;
	let shot;
	try {
		shot = shotText ? JSON.parse(shotText) : {};
	} catch {
		return toolEnvelope({
			ok: false,
			error: "no text match; screenshot envelope unparseable"
		}, true);
	}
	if (!shot.contentType || !shot.dataBase64) return toolEnvelope({
		ok: false,
		error: "no text match; screenshot envelope missing fields"
	}, true);
	const visual = await pickElementVisual(shot.dataBase64, shot.contentType, intent, surfaces, signal);
	if (visual.confidence < .5) return toolEnvelope({
		ok: false,
		error: "no element matched intent (text + visual)",
		picked,
		visual
	}, true);
	const clickArgs = {
		tabId,
		action: "click",
		x: visual.x,
		y: visual.y,
		force: true
	};
	const clickEnv = await dispatchBrowserTool("browser_mouse", clickArgs, signal);
	if (clickEnv.isError) return clickEnv;
	return toolEnvelope({
		ok: true,
		action_taken: "click_visual",
		x: visual.x,
		y: visual.y,
		confidence: visual.confidence,
		reason: visual.reason
	});
}
|
|
5753
|
+
/**
|
|
4719
5754
|
* Dispatch an action against a known ref via the appropriate primitive.
|
|
4720
5755
|
* Shared between REF mode and INTENT-mode-text-match in `browser_act`.
|
|
4721
5756
|
* Returns an MCP envelope (text content + optional isError).
|
|
@@ -7309,6 +8344,27 @@ function workerToolsEnabled() {
|
|
|
7309
8344
|
function browserCompoundToolsEnabled() {
|
|
7310
8345
|
return compressorAvailable();
|
|
7311
8346
|
}
|
|
8347
|
+
/**
 * Gate for the tools registered with capability "browser_power" — the
 * raw L0/L1 primitives (`browser_read_page`, `browser_mouse`,
 * `browser_drag`, `browser_type`, `browser_keyboard`, `browser_scroll`,
 * `browser_eval_js`, `browser_diagnostics`, `browser_find`,
 * `browser_close_tab`, `browser_list_tabs`, `browser_wait`,
 * `browser_download`).
 *
 * The answer is read from `state.powerBrowseEnabled`, which setup turns
 * on for `--power-browse` or `GH_ROUTER_ENABLE_POWER_BROWSE=1`. With the
 * flag off, `--browse` exposes only the intent-level lead tools (`act`,
 * `observe`, `extract`, `navigate`, `screenshot`, `open_tab`).
 *
 * The MCP handler additionally requires `browserToolsEnabled()`, so
 * power mode never applies without basic browse mode.
 *
 * @returns true only when the flag is exactly `true`.
 */
function browserPowerToolsEnabled() {
	const { powerBrowseEnabled } = state;
	return powerBrowseEnabled === true;
}
|
|
7312
8368
|
|
|
7313
8369
|
//#endregion
|
|
7314
8370
|
//#region src/routes/mcp/handler.ts
|
|
@@ -7486,6 +8542,7 @@ function toolEntries() {
|
|
|
7486
8542
|
if (t.capability === "stand_in") return standInToolEnabled();
|
|
7487
8543
|
if (t.capability === "browser") return browserToolsEnabled();
|
|
7488
8544
|
if (t.capability === "browser_compound") return browserToolsEnabled() && browserCompoundToolsEnabled();
|
|
8545
|
+
if (t.capability === "browser_power") return browserToolsEnabled() && browserPowerToolsEnabled();
|
|
7489
8546
|
return true;
|
|
7490
8547
|
}).map((t) => ({
|
|
7491
8548
|
name: t.toolNameHttp,
|
|
@@ -7778,6 +8835,7 @@ async function handleToolsCall(body) {
|
|
|
7778
8835
|
if (nonPersonaTool && nonPersonaTool.capability === "stand_in" && !standInToolEnabled()) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
|
|
7779
8836
|
if (nonPersonaTool && nonPersonaTool.capability === "browser" && !browserToolsEnabled()) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
|
|
7780
8837
|
if (nonPersonaTool && nonPersonaTool.capability === "browser_compound" && !(browserToolsEnabled() && browserCompoundToolsEnabled())) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
|
|
8838
|
+
if (nonPersonaTool && nonPersonaTool.capability === "browser_power" && !(browserToolsEnabled() && browserPowerToolsEnabled())) return rpcError(body.id, RPC_METHOD_NOT_FOUND, `tools/call: unknown tool "${name$1}"`);
|
|
7781
8839
|
let personaPrompt;
|
|
7782
8840
|
let personaContext;
|
|
7783
8841
|
let personaEffort;
|
|
@@ -11331,7 +12389,10 @@ function buildPeerAwarenessSnippet(opts) {
|
|
|
11331
12389
|
if (opts.workerToolsAvailable) para2Parts.push("`worker_explore` runs a Gemini-backed read-only worker that returns a summary, using its own context rather than yours; concurrent launches share the `MAX_INFLIGHT_TOOLS_CALL=8` cap with operator traffic.", "`worker_implement` is the same worker with edit/write/bash; `worktree: true` runs it in an isolated git worktree and returns the diff.", "Workers themselves have `code_search` in their toolset.");
|
|
11332
12390
|
para2Parts.push("`web_search` surfaces citable sources for docs, errors, and upstream issues.");
|
|
11333
12391
|
if (opts.standInAvailable) para2Parts.push("`stand_in` provides three-lab consensus for decision tiebreak when the user is unavailable.");
|
|
11334
|
-
if (opts.browseAvailable)
|
|
12392
|
+
if (opts.browseAvailable) {
|
|
12393
|
+
const powerNote = opts.powerBrowseAvailable ? " Power mode is on: the L0/L1 primitives (`browser_mouse`, `browser_drag`, `browser_type`, `browser_keyboard`, `browser_scroll`, `browser_eval_js`, `browser_read_page`, `browser_diagnostics`, `browser_find`) are also available for direct DOM / coordinate control." : "";
|
|
12394
|
+
para2Parts.push(`\`browser_*\` tools (under \`mcp__gh-router-peers__browser_*\`) drive a real Chrome / Edge browser via a local extension. Lead surface: \`browser_act(intent, value?)\` for any click / fill / type / scroll-to (an inner fast model resolves intent), \`browser_observe(intent?)\` for a 2-4 sentence natural-language page description, \`browser_extract(schema, instruction)\` for typed extraction, \`browser_navigate\` / \`browser_open_tab\` / \`browser_screenshot\` for state and visuals. The lead model never sees raw DOM: refs, bboxes, and role/name dumps stay internal.${powerNote}`);
|
|
12395
|
+
}
|
|
11335
12396
|
return [
|
|
11336
12397
|
"## Peer review and advisor",
|
|
11337
12398
|
"",
|
|
@@ -12794,7 +13855,7 @@ function initProxyFromEnv() {
|
|
|
12794
13855
|
//#endregion
|
|
12795
13856
|
//#region package.json
|
|
12796
13857
|
var name = "github-router";
|
|
12797
|
-
var version$1 = "0.3.52";
|
|
13858
|
+
var version$1 = "0.3.66";
|
|
12798
13859
|
|
|
12799
13860
|
//#endregion
|
|
12800
13861
|
//#region src/lib/approval.ts
|
|
@@ -14516,6 +15577,11 @@ async function setupAndServe(options) {
|
|
|
14516
15577
|
state.showToken = options.showToken;
|
|
14517
15578
|
state.extendedBetas = options.extendedBetas;
|
|
14518
15579
|
state.browseEnabled = options.browseEnabled || process.env.GH_ROUTER_ENABLE_BROWSE === "1";
|
|
15580
|
+
state.powerBrowseEnabled = options.powerBrowseEnabled || process.env.GH_ROUTER_ENABLE_POWER_BROWSE === "1";
|
|
15581
|
+
if (state.powerBrowseEnabled) state.browseEnabled = true;
|
|
15582
|
+
if (process.env.GH_ROUTER_BROWSER_NO_HUMANLIKE === "1") state.humanlikeForce = "off";
|
|
15583
|
+
else if (options.humanlikeEnabled || process.env.GH_ROUTER_HUMANLIKE === "1") state.humanlikeForce = "on";
|
|
15584
|
+
else state.humanlikeForce = "auto";
|
|
14519
15585
|
if (process.env.COPILOT_API_URL) state.copilotApiUrl = process.env.COPILOT_API_URL;
|
|
14520
15586
|
await ensurePaths();
|
|
14521
15587
|
await cacheVSCodeVersion();
|
|
@@ -14623,6 +15689,16 @@ const sharedServerArgs = {
|
|
|
14623
15689
|
type: "boolean",
|
|
14624
15690
|
default: false,
|
|
14625
15691
|
description: "Enable the browser-control MCP tools (browser_open_tab, browser_screenshot, browser_click, etc.) on /mcp. Requires Chrome or Edge installed; the bundled extension must be loaded on first tool call (the proxy returns install_required with Web Store URLs + a Load Unpacked fallback path). Off by default; can also be enabled with GH_ROUTER_ENABLE_BROWSE=1."
|
|
15692
|
+
},
|
|
15693
|
+
"power-browse": {
|
|
15694
|
+
type: "boolean",
|
|
15695
|
+
default: false,
|
|
15696
|
+
description: "Expose the full ~18-tool browser MCP surface (raw read_page, mouse / drag / scroll / keyboard / type primitives, eval_js, diagnostics, find, locate). Default --browse exposes only the 6 lead-model tools (act, observe, extract, navigate, screenshot, open_tab) that hide DOM details behind intent. Implies --browse. Off by default; can also be enabled with GH_ROUTER_ENABLE_POWER_BROWSE=1."
|
|
15697
|
+
},
|
|
15698
|
+
humanlike: {
|
|
15699
|
+
type: "boolean",
|
|
15700
|
+
default: false,
|
|
15701
|
+
description: "Force humanlike pacing on ALL browser tool dispatches: Beta-distributed inter-action delays (800-4600 ms), Bezier mouse trajectories with overshoot-and-correct, per-keystroke jitter with word-end pauses, scroll chunking. Use for known anti-bot sites (Cloudflare, Datadome). Off by default (auto mode); GH_ROUTER_HUMANLIKE=1 is the env equivalent. GH_ROUTER_BROWSER_NO_HUMANLIKE=1 hard-disables (wins over --humanlike, for tests)."
|
|
14626
15702
|
}
|
|
14627
15703
|
};
|
|
14628
15704
|
const allowedAccountTypes = new Set([
|
|
@@ -14660,7 +15736,9 @@ function parseSharedArgs(args) {
|
|
|
14660
15736
|
showToken: args["show-token"],
|
|
14661
15737
|
proxyEnv: args["proxy-env"],
|
|
14662
15738
|
extendedBetas: args["extended-betas"],
|
|
14663
|
-
browseEnabled: args.browse
|
|
15739
|
+
browseEnabled: args.browse,
|
|
15740
|
+
powerBrowseEnabled: args["power-browse"],
|
|
15741
|
+
humanlikeEnabled: args.humanlike
|
|
14664
15742
|
};
|
|
14665
15743
|
}
|
|
14666
15744
|
/**
|
|
@@ -14900,7 +15978,8 @@ const claude = defineCommand({
|
|
|
14900
15978
|
geminiAvailable: geminiAvailable$1,
|
|
14901
15979
|
workerToolsAvailable: workerToolsEnabled(),
|
|
14902
15980
|
standInAvailable: standInToolEnabled(),
|
|
14903
|
-
browseAvailable: state.browseEnabled
|
|
15981
|
+
browseAvailable: state.browseEnabled,
|
|
15982
|
+
powerBrowseAvailable: state.powerBrowseEnabled
|
|
14904
15983
|
});
|
|
14905
15984
|
extraArgs.push("--append-system-prompt", peerSnippet);
|
|
14906
15985
|
try {
|