assistme 0.3.4 → 0.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,6 +12,12 @@ import type {
12
12
  ActionResult,
13
13
  RefActionResult,
14
14
  } from "./types.js";
15
+ import {
16
+ REF_CACHE_MAX_SIZE,
17
+ FRAME_CONTEXTS_MAX_SIZE,
18
+ CDP_COMMAND_TIMEOUT_MS,
19
+ WS_CONNECT_TIMEOUT_MS,
20
+ } from "../utils/constants.js";
15
21
 
16
22
  export class BrowserController {
17
23
  private ws: WebSocket | null = null;
@@ -21,6 +27,7 @@ export class BrowserController {
21
27
  private connected = false;
22
28
  private currentTabId: string | null = null;
23
29
  private refCache: Map<number, RefEntry> = new Map();
30
+ private frameContexts: Map<number, number> = new Map(); // refId → contextId
24
31
 
25
32
  constructor(port = 9222) {
26
33
  this.debugPort = port;
@@ -87,9 +94,9 @@ export class BrowserController {
87
94
  if (!settled) {
88
95
  settled = true;
89
96
  this.ws?.close();
90
- reject(new Error("Connection timeout (5s)"));
97
+ reject(new Error(`Connection timeout (${WS_CONNECT_TIMEOUT_MS}ms)`));
91
98
  }
92
- }, 5000);
99
+ }, WS_CONNECT_TIMEOUT_MS);
93
100
 
94
101
  this.ws.on("open", () => {
95
102
  if (settled) return;
@@ -142,6 +149,9 @@ export class BrowserController {
142
149
  this.ws = null;
143
150
  this.connected = false;
144
151
  }
152
+ // Clear caches to prevent memory leaks across task boundaries
153
+ this.refCache.clear();
154
+ this.frameContexts.clear();
145
155
  return "Disconnected from browser.";
146
156
  }
147
157
 
@@ -165,7 +175,7 @@ export class BrowserController {
165
175
  const timeout = setTimeout(() => {
166
176
  this.callbacks.delete(id);
167
177
  reject(new Error(`CDP command timed out: ${method}`));
168
- }, 15000);
178
+ }, CDP_COMMAND_TIMEOUT_MS);
169
179
 
170
180
  this.callbacks.set(id, (response) => {
171
181
  clearTimeout(timeout);
@@ -367,17 +377,50 @@ export class BrowserController {
367
377
  const selectorJS = JSON.stringify(selector);
368
378
  const textJS = JSON.stringify(text);
369
379
 
370
- // First clear and set value via JS, dispatching all relevant events
380
+ // First try to find the element in main document, then in same-origin iframes
371
381
  const result = await this.send("Runtime.evaluate", {
372
382
  expression: `
373
383
  (function() {
374
- const el = document.querySelector(${selectorJS});
384
+ var el = document.querySelector(${selectorJS});
385
+
386
+ // If not found in main document, search same-origin iframes
387
+ if (!el) {
388
+ var iframes = document.querySelectorAll('iframe');
389
+ for (var i = 0; i < iframes.length; i++) {
390
+ try {
391
+ var iframeDoc = iframes[i].contentDocument;
392
+ if (iframeDoc) {
393
+ el = iframeDoc.querySelector(${selectorJS});
394
+ if (el) break;
395
+ }
396
+ } catch(e) { /* cross-origin, skip */ }
397
+ }
398
+ }
399
+
375
400
  if (!el) return 'Element not found: ' + ${selectorJS};
376
401
 
377
402
  el.focus();
378
403
 
379
- // Clear existing value
380
- const nativeInputValueSetter = Object.getOwnPropertyDescriptor(
404
+ // Check if this is a contenteditable element (rich text editor)
405
+ var isContentEditable = el.isContentEditable ||
406
+ el.getAttribute('contenteditable') === 'true' ||
407
+ el.getAttribute('contenteditable') === '';
408
+
409
+ if (isContentEditable) {
410
+ // For contenteditable: select all content, then replace
411
+ var ownerDoc = el.ownerDocument;
412
+ var sel = ownerDoc.defaultView.getSelection();
413
+ var range = ownerDoc.createRange();
414
+ range.selectNodeContents(el);
415
+ sel.removeAllRanges();
416
+ sel.addRange(range);
417
+ // Use insertText command which respects undo stack and triggers input events
418
+ ownerDoc.execCommand('insertText', false, ${textJS});
419
+ return 'Typed into: ' + (el.tagName || '') + ' [contenteditable]';
420
+ }
421
+
422
+ // For input/textarea: clear and set value
423
+ var nativeInputValueSetter = Object.getOwnPropertyDescriptor(
381
424
  window.HTMLInputElement.prototype, 'value'
382
425
  )?.set || Object.getOwnPropertyDescriptor(
383
426
  window.HTMLTextAreaElement.prototype, 'value'
@@ -398,7 +441,36 @@ export class BrowserController {
398
441
  returnByValue: true,
399
442
  });
400
443
 
401
- return ((result as CDPEvalResult).result?.value as string) || "Text entered.";
444
+ const textResult = ((result as CDPEvalResult).result?.value as string) || "";
445
+
446
+ // If element still not found, try typing into the currently focused element via CDP
447
+ if (textResult.startsWith("Element not found")) {
448
+ return this.typeAtFocus(text);
449
+ }
450
+
451
+ return textResult || "Text entered.";
452
+ }
453
+
454
+ /**
455
+ * Type text into the currently focused element using CDP Input.insertText.
456
+ * This bypasses DOM queries entirely and works with any focused element,
457
+ * including those inside cross-origin iframes or shadow DOM.
458
+ */
459
+ async typeAtFocus(text: string): Promise<string> {
460
+ this.ensureConnected();
461
+
462
+ // Optionally clear existing content: select all then delete
463
+ const modKey = platform() === "darwin" ? "Meta" : "Control";
464
+ await this.pressKey(`${modKey}+a`);
465
+ await new Promise((r) => setTimeout(r, 50));
466
+ await this.pressKey("Backspace");
467
+ await new Promise((r) => setTimeout(r, 50));
468
+
469
+ // Insert text via CDP — goes through the browser's native input pipeline
470
+ await this.send("Input.insertText", { text });
471
+ await new Promise((r) => setTimeout(r, 100));
472
+
473
+ return "Text entered (into focused element).";
402
474
  }
403
475
 
404
476
  async pressKey(key: string): Promise<string> {
@@ -670,6 +742,9 @@ export class BrowserController {
670
742
  box: r.box as BoundingBox,
671
743
  }));
672
744
 
745
+ // 1b. Discover elements in cross-origin iframes via CDP frame targeting
746
+ await this.discoverCrossOriginFrameRefs(refs);
747
+
673
748
  // 2. Optionally inject visual overlay with ref labels
674
749
  // (Skip for dense pages — labels would overlap and become unreadable)
675
750
  if (annotate && refs.length <= 40) {
@@ -725,6 +800,8 @@ export class BrowserController {
725
800
  }
726
801
 
727
802
  // 5. Cache refs for subsequent act() calls
803
+ // Note: frameContexts is populated by discoverCrossOriginFrameRefs above,
804
+ // so we only clear refCache here (frameContexts was cleared at start of discover)
728
805
  this.refCache.clear();
729
806
  for (const ref of refs) {
730
807
  this.refCache.set(ref.id, ref);
@@ -752,6 +829,237 @@ export class BrowserController {
752
829
  return table;
753
830
  }
754
831
 
832
+ // ── Cross-Origin Iframe Discovery ────────────────────────────────
833
+
834
+ /**
835
+ * Use CDP's Page.getFrameTree + Runtime.evaluate with contextId to discover
836
+ * interactive elements inside cross-origin iframes (e.g., ProtonMail editor,
837
+ * Google Docs, embedded rich text editors).
838
+ *
839
+ * Same-origin iframes are already handled inline by the main snapshot JS.
840
+ * This method handles the ones that threw cross-origin errors.
841
+ */
842
+ private async discoverCrossOriginFrameRefs(refs: RefEntry[]): Promise<void> {
843
+ this.frameContexts.clear();
844
+ try {
845
+ // Get the frame tree to find all child frames
846
+ const frameTree = (await this.send("Page.getFrameTree")) as {
847
+ frameTree?: {
848
+ frame: { id: string };
849
+ childFrames?: Array<{ frame: { id: string; url: string; name?: string } }>;
850
+ };
851
+ };
852
+
853
+ const mainFrameId = frameTree.frameTree?.frame?.id;
854
+ const childFrames = frameTree.frameTree?.childFrames || [];
855
+ if (childFrames.length === 0) return;
856
+
857
+ // Get all execution contexts to map frameId → contextId
858
+ // We need to enable Runtime events and collect contexts
859
+ const contexts = await this.getFrameContexts(mainFrameId || "");
860
+
861
+ for (const child of childFrames) {
862
+ const frameId = child.frame.id;
863
+ const contextId = contexts.get(frameId);
864
+ if (!contextId) continue;
865
+
866
+ // Get the iframe's bounding rect from the parent frame for coordinate offset
867
+ const iframeOffsetResult = await this.send("Runtime.evaluate", {
868
+ expression: `
869
+ (function() {
870
+ var iframes = document.querySelectorAll('iframe');
871
+ for (var i = 0; i < iframes.length; i++) {
872
+ try {
873
+ // Match by frame src or name
874
+ var f = iframes[i];
875
+ if (f.contentWindow) {
876
+ var r = f.getBoundingClientRect();
877
+ if (r.width > 10 && r.height > 10) {
878
+ return JSON.stringify({ x: r.x, y: r.y, width: r.width, height: r.height, index: i });
879
+ }
880
+ }
881
+ } catch(e) {}
882
+ }
883
+ return 'null';
884
+ })()
885
+ `,
886
+ returnByValue: true,
887
+ });
888
+
889
+ let iframeOffset = { x: 0, y: 0 };
890
+ try {
891
+ const parsed = JSON.parse(
892
+ ((iframeOffsetResult as CDPEvalResult).result?.value as string) || "null"
893
+ );
894
+ if (parsed) iframeOffset = { x: parsed.x, y: parsed.y };
895
+ } catch {
896
+ /* ignore */
897
+ }
898
+
899
+ // Evaluate inside the child frame's execution context
900
+ const startRefId = refs.length + 1;
901
+ try {
902
+ const frameResult = await this.send("Runtime.evaluate", {
903
+ expression: `
904
+ (function() {
905
+ var selectors = [
906
+ 'a[href]', 'button', 'input:not([type="hidden"])', 'select', 'textarea',
907
+ '[role="button"]', '[role="link"]', '[role="checkbox"]', '[role="radio"]',
908
+ '[role="combobox"]', '[role="listbox"]', '[role="menuitem"]', '[role="tab"]',
909
+ '[role="switch"]', '[role="slider"]', '[role="option"]', '[role="searchbox"]',
910
+ '[onclick]', '[tabindex]:not([tabindex="-1"])',
911
+ '[contenteditable="true"]', '[contenteditable=""]'
912
+ ].join(', ');
913
+
914
+ var all = document.querySelectorAll(selectors);
915
+ // Also check if the body itself is contenteditable
916
+ if (document.body && (document.body.isContentEditable || document.body.getAttribute('contenteditable') === 'true')) {
917
+ all = [document.body].concat(Array.from(all));
918
+ }
919
+
920
+ var refs = [];
921
+ var startId = ${startRefId};
922
+ var vh = window.innerHeight;
923
+ var vw = window.innerWidth;
924
+
925
+ for (var i = 0; i < all.length && refs.length < 20; i++) {
926
+ var el = all[i];
927
+ var rect = el.getBoundingClientRect();
928
+ if (rect.width < 5 || rect.height < 5) continue;
929
+ var style = window.getComputedStyle(el);
930
+ if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') continue;
931
+
932
+ var role = el.getAttribute('role') || '';
933
+ if (!role) {
934
+ var tag = el.tagName.toLowerCase();
935
+ if (tag === 'a') role = 'link';
936
+ else if (tag === 'button') role = 'button';
937
+ else if (tag === 'input') {
938
+ var t = (el.type || 'text').toLowerCase();
939
+ if (t === 'checkbox') role = 'checkbox';
940
+ else if (t === 'radio') role = 'radio';
941
+ else if (t === 'submit' || t === 'button') role = 'button';
942
+ else role = 'textbox';
943
+ }
944
+ else if (tag === 'select') role = 'combobox';
945
+ else if (tag === 'textarea') role = 'textbox';
946
+ else if (el.isContentEditable) role = 'textbox';
947
+ else role = tag;
948
+ }
949
+
950
+ var name = '';
951
+ var ariaLabel = el.getAttribute('aria-label');
952
+ if (ariaLabel) {
953
+ name = ariaLabel;
954
+ } else if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
955
+ name = el.getAttribute('placeholder') || el.getAttribute('name') || '';
956
+ } else if (el.isContentEditable) {
957
+ name = 'compose body';
958
+ } else {
959
+ name = (el.textContent || '').trim().slice(0, 60);
960
+ }
961
+
962
+ var refId = startId + refs.length;
963
+ el.setAttribute('data-assistme-ref', String(refId));
964
+
965
+ refs.push({
966
+ id: refId,
967
+ role: role,
968
+ name: name,
969
+ tag: el.tagName.toLowerCase(),
970
+ type: el.getAttribute('type') || '',
971
+ box: {
972
+ x: Math.round(rect.x),
973
+ y: Math.round(rect.y),
974
+ width: Math.round(rect.width),
975
+ height: Math.round(rect.height)
976
+ },
977
+ inFrame: true
978
+ });
979
+ }
980
+
981
+ return JSON.stringify(refs);
982
+ })()
983
+ `,
984
+ contextId,
985
+ returnByValue: true,
986
+ });
987
+
988
+ const frameRefs = JSON.parse(
989
+ ((frameResult as CDPEvalResult).result?.value as string) || "[]"
990
+ );
991
+
992
+ for (const r of frameRefs) {
993
+ refs.push({
994
+ id: r.id as number,
995
+ role: r.role as string,
996
+ name: r.name as string,
997
+ tag: r.tag as string,
998
+ inputType: (r.type as string) || "",
999
+ box: {
1000
+ x: Math.round((r.box.x as number) + iframeOffset.x),
1001
+ y: Math.round((r.box.y as number) + iframeOffset.y),
1002
+ width: r.box.width as number,
1003
+ height: r.box.height as number,
1004
+ },
1005
+ });
1006
+ // Store frame context for later resolution (with size guard)
1007
+ if (this.frameContexts.size < FRAME_CONTEXTS_MAX_SIZE) {
1008
+ this.frameContexts.set(r.id as number, contextId);
1009
+ }
1010
+ }
1011
+ } catch {
1012
+ // Frame evaluation failed (e.g., about:blank, pdf viewer) — skip
1013
+ }
1014
+ }
1015
+ } catch {
1016
+ // Frame tree unavailable — not critical, skip silently
1017
+ }
1018
+ }
1019
+
1020
+ /**
1021
+ * Get execution context IDs for each frame in the page.
1022
+ * Uses Runtime.executionContextCreated events collected during the session,
1023
+ * or falls back to evaluating in known frames.
1024
+ */
1025
+ private async getFrameContexts(_mainFrameId: string): Promise<Map<string, number>> {
1026
+ const contexts = new Map<string, number>();
1027
+ try {
1028
+ // Enable Runtime domain to get context descriptions (may already be enabled)
1029
+ await this.send("Runtime.enable").catch(() => {});
1030
+
1031
+ // Use Page.getFrameTree to get frame IDs, then try to create isolated worlds
1032
+ // for each frame to get their execution context IDs
1033
+ const frameTree = (await this.send("Page.getFrameTree")) as {
1034
+ frameTree?: {
1035
+ frame: { id: string };
1036
+ childFrames?: Array<{ frame: { id: string } }>;
1037
+ };
1038
+ };
1039
+
1040
+ const childFrames = frameTree.frameTree?.childFrames || [];
1041
+ for (const child of childFrames) {
1042
+ try {
1043
+ // Create an isolated world in the frame to get a context ID
1044
+ const world = (await this.send("Page.createIsolatedWorld", {
1045
+ frameId: child.frame.id,
1046
+ worldName: "assistme-snapshot",
1047
+ grantUniveralAccess: true,
1048
+ })) as { executionContextId?: number };
1049
+
1050
+ if (world.executionContextId) {
1051
+ contexts.set(child.frame.id, world.executionContextId);
1052
+ }
1053
+ } catch {
1054
+ // Frame might not support isolated worlds — skip
1055
+ }
1056
+ }
1057
+ } catch {
1058
+ // Fallback: no contexts available
1059
+ }
1060
+ return contexts;
1061
+ }
1062
+
755
1063
  // ── Ref Resolution ────────────────────────────────────────────────
756
1064
 
757
1065
  /**
@@ -869,9 +1177,103 @@ export class BrowserController {
869
1177
  });
870
1178
 
871
1179
  const value = (result as CDPEvalResult).result?.value as string;
872
- if (!value || value === "null") return null;
1180
+ if (value && value !== "null") {
1181
+ try {
1182
+ return JSON.parse(value);
1183
+ } catch {
1184
+ /* fall through to frame search */
1185
+ }
1186
+ }
1187
+
1188
+ // Strategy 3: search in cross-origin iframe contexts
1189
+ const frameContextId = this.frameContexts.get(refId);
1190
+ if (frameContextId) {
1191
+ return this.resolveRefInFrame(refId, frameContextId, role, name);
1192
+ }
1193
+
1194
+ return null;
1195
+ }
1196
+
1197
+ /**
1198
+ * Resolve a ref inside a cross-origin iframe using its execution context.
1199
+ * Returns coordinates adjusted by the iframe's viewport offset.
1200
+ */
1201
+ private async resolveRefInFrame(
1202
+ refId: number,
1203
+ contextId: number,
1204
+ role: string,
1205
+ name: string
1206
+ ): Promise<{ x: number; y: number; width: number; height: number; error?: string } | null> {
1207
+ const roleJS = JSON.stringify(role);
1208
+ const nameJS = JSON.stringify(name);
1209
+
873
1210
  try {
874
- return JSON.parse(value);
1211
+ // Get iframe offset from main document
1212
+ const offsetResult = await this.send("Runtime.evaluate", {
1213
+ expression: `
1214
+ (function() {
1215
+ var iframes = document.querySelectorAll('iframe');
1216
+ for (var i = 0; i < iframes.length; i++) {
1217
+ var r = iframes[i].getBoundingClientRect();
1218
+ if (r.width > 10 && r.height > 10) {
1219
+ return JSON.stringify({ x: r.x, y: r.y });
1220
+ }
1221
+ }
1222
+ return JSON.stringify({ x: 0, y: 0 });
1223
+ })()
1224
+ `,
1225
+ returnByValue: true,
1226
+ });
1227
+ const offset = JSON.parse(
1228
+ ((offsetResult as CDPEvalResult).result?.value as string) || '{"x":0,"y":0}'
1229
+ );
1230
+
1231
+ // Resolve element inside the frame
1232
+ const frameResult = await this.send("Runtime.evaluate", {
1233
+ expression: `
1234
+ (function() {
1235
+ var el = document.querySelector('[data-assistme-ref="${refId}"]');
1236
+ if (!el && ${roleJS} && ${nameJS}) {
1237
+ // Fallback: search by role
1238
+ var candidates = document.querySelectorAll('*');
1239
+ for (var i = 0; i < candidates.length; i++) {
1240
+ var c = candidates[i];
1241
+ if (c.isContentEditable || c.getAttribute('contenteditable') === 'true') {
1242
+ el = c; break;
1243
+ }
1244
+ }
1245
+ }
1246
+ if (!el) return 'null';
1247
+
1248
+ el.scrollIntoView({ block: 'center', behavior: 'instant' });
1249
+ var r = el.getBoundingClientRect();
1250
+ if (r.width < 1 || r.height < 1) return JSON.stringify({ error: 'Zero size' });
1251
+
1252
+ return JSON.stringify({
1253
+ x: r.x + r.width / 2,
1254
+ y: r.y + r.height / 2,
1255
+ width: r.width,
1256
+ height: r.height
1257
+ });
1258
+ })()
1259
+ `,
1260
+ contextId,
1261
+ returnByValue: true,
1262
+ });
1263
+
1264
+ const value = (frameResult as CDPEvalResult).result?.value as string;
1265
+ if (!value || value === "null") return null;
1266
+
1267
+ const parsed = JSON.parse(value);
1268
+ if (parsed.error) return parsed;
1269
+
1270
+ // Adjust coordinates by iframe offset
1271
+ return {
1272
+ x: parsed.x + offset.x,
1273
+ y: parsed.y + offset.y,
1274
+ width: parsed.width,
1275
+ height: parsed.height,
1276
+ };
875
1277
  } catch {
876
1278
  return null;
877
1279
  }
@@ -981,11 +1383,24 @@ export class BrowserController {
981
1383
  await new Promise((r) => setTimeout(r, 50));
982
1384
 
983
1385
  // 2. Verify the field is empty; if not, fall back to JS-based clearing
984
- const cleared = await this.send("Runtime.evaluate", {
1386
+ // Determine which context to evaluate in (main doc or iframe)
1387
+ const frameContextId = this.frameContexts.get(refId);
1388
+ const clearEvalOpts: Record<string, unknown> = {
985
1389
  expression: `
986
1390
  (function() {
987
1391
  var el = document.querySelector('[data-assistme-ref="${refId}"]');
988
1392
  if (!el) return 'no_element';
1393
+
1394
+ // For contenteditable elements, check textContent instead of value
1395
+ if (el.isContentEditable || el.getAttribute('contenteditable') === 'true') {
1396
+ if (el.textContent && el.textContent.trim() !== '') {
1397
+ el.textContent = '';
1398
+ el.dispatchEvent(new Event('input', { bubbles: true }));
1399
+ return 'js_cleared';
1400
+ }
1401
+ return 'ok';
1402
+ }
1403
+
989
1404
  if (el.value !== undefined && el.value !== '') {
990
1405
  // Ctrl+A didn't work (some frameworks intercept it) — clear via JS
991
1406
  var setter = Object.getOwnPropertyDescriptor(
@@ -1003,9 +1418,15 @@ export class BrowserController {
1003
1418
  })()
1004
1419
  `,
1005
1420
  returnByValue: true,
1006
- });
1421
+ };
1422
+ // If element is in a cross-origin iframe, evaluate in its context
1423
+ if (frameContextId) {
1424
+ clearEvalOpts.contextId = frameContextId;
1425
+ }
1426
+ const cleared = await this.send("Runtime.evaluate", clearEvalOpts);
1007
1427
  const clearStatus = ((cleared as CDPEvalResult).result?.value as string) || "ok";
1008
- if (clearStatus === "no_element") {
1428
+ if (clearStatus === "no_element" && !frameContextId) {
1429
+ // Element not found in main doc and no frame context — truly missing
1009
1430
  return {
1010
1431
  success: false,
1011
1432
  message: `Ref ${refLabel} not found after click. Take a new snapshot.`,
@@ -92,7 +92,7 @@ export function createBrowserMcpServer(): McpSdkServerConfigWithInstance {
92
92
  ),
93
93
  tool(
94
94
  "browser_type",
95
- "Type text into an input field in the user's browser.",
95
+ "Type text into an input field in the user's browser. If the CSS selector fails, automatically falls back to typing into the currently focused element. Works with contenteditable elements (rich text editors) and cross-origin iframes.",
96
96
  {
97
97
  selector: z.string().describe("CSS selector of the input element"),
98
98
  text: z.string().describe("Text to type"),