assistme 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -31,6 +31,55 @@ import {
31
31
  import { createEventHooks } from "./event-hooks.js";
32
32
  import { BASE_SYSTEM_PROMPT } from "./system-prompt.js";
33
33
 
34
/**
 * Manages the task wall-clock timeout.
 *
 * The timeout aborts the supplied AbortController once `timeoutMs` of
 * *active* time has elapsed. It can be paused while the agent is waiting for
 * user input (ask_user) so that idle wait time doesn't count toward the
 * timeout, and resumed afterwards with whatever budget was left.
 *
 * pause(), resume() and clear() are all idempotent: calling any of them
 * repeatedly, or out of order, never arms more than one timer, and nothing is
 * re-armed once the timeout has fired or clear() has been called.
 */
class TaskTimeout {
  /** Handle of the currently armed timer, or null while paused/finished. */
  private timeoutId: ReturnType<typeof setTimeout> | null = null;
  /** Active-time budget (ms) left as of the last (re)arming. */
  private remainingMs: number;
  /** Date.now() at the moment the current timer was armed. */
  private resumedAt: number;
  /** True once the timer has fired or clear() ran; blocks any re-arming. */
  private finished = false;

  /**
   * @param abortController Controller whose abort() is invoked on timeout.
   * @param timeoutMs Total active-time budget in milliseconds.
   */
  constructor(
    private abortController: AbortController,
    timeoutMs: number
  ) {
    this.remainingMs = timeoutMs;
    this.resumedAt = Date.now();
    this.schedule();
  }

  /** Arm a timer for the remaining budget. Callers guarantee none is armed. */
  private schedule(): void {
    this.timeoutId = setTimeout(() => {
      // Drop the stale handle and mark the timeout as spent so that a late
      // resume() cannot arm a second timer with an outdated budget.
      this.timeoutId = null;
      this.finished = true;
      this.abortController.abort();
    }, this.remainingMs);
  }

  /** Pause the timeout (e.g. while waiting for user). No-op if not running. */
  pause(): void {
    if (this.timeoutId === null) return;

    clearTimeout(this.timeoutId);
    this.timeoutId = null;
    // Charge only the time that elapsed since the timer was last armed.
    const elapsed = Date.now() - this.resumedAt;
    this.remainingMs = Math.max(0, this.remainingMs - elapsed);
  }

  /**
   * Resume the timeout after user interaction completes.
   *
   * No-op when a timer is already running (resume without a matching pause,
   * or a duplicate resume) and when the timeout has already fired or been
   * cleared — otherwise an extra, untracked timer would be armed that
   * clear() could never cancel.
   */
  resume(): void {
    if (this.finished || this.timeoutId !== null) return;

    this.resumedAt = Date.now();
    this.schedule();
  }

  /** Cancel the timeout permanently; later resume() calls are ignored. */
  clear(): void {
    this.finished = true;
    if (this.timeoutId !== null) {
      clearTimeout(this.timeoutId);
      this.timeoutId = null;
    }
  }
}
82
+
34
83
  const MAX_HISTORY_ENTRIES = 10;
35
84
  const MAX_RESPONSE_LENGTH = 1500;
36
85
 
@@ -143,6 +192,9 @@ export class TaskProcessor {
143
192
  systemPrompt += historyPrompt;
144
193
  }
145
194
 
195
+ const abortController = new AbortController();
196
+ const taskTimeout = new TaskTimeout(abortController, taskTimeoutMs);
197
+
146
198
  // Create MCP servers for custom tools
147
199
  const browserServer = createBrowserMcpServer();
148
200
  const agentToolsServer = createAgentToolsServer({
@@ -150,6 +202,8 @@ export class TaskProcessor {
150
202
  skillManager: this.skillManager,
151
203
  taskId: task.id,
152
204
  sessionId: this.sessionId || undefined,
205
+ onUserWaitStart: () => taskTimeout.pause(),
206
+ onUserWaitEnd: () => taskTimeout.resume(),
153
207
  });
154
208
 
155
209
  // Create event hooks for Supabase event emission
@@ -203,7 +257,6 @@ export class TaskProcessor {
203
257
  };
204
258
  }
205
259
 
206
- const abortController = new AbortController();
207
260
  const options: Options = {
208
261
  model: config.model,
209
262
  systemPrompt,
@@ -221,11 +274,7 @@ export class TaskProcessor {
221
274
  abortController,
222
275
  };
223
276
 
224
- // Wall-clock timeout via abort
225
277
  const taskStartTime = Date.now();
226
- const timeoutId = setTimeout(() => {
227
- abortController.abort();
228
- }, taskTimeoutMs);
229
278
 
230
279
  try {
231
280
  for await (const message of query({
@@ -302,7 +351,7 @@ export class TaskProcessor {
302
351
  }
303
352
  }
304
353
  } finally {
305
- clearTimeout(timeoutId);
354
+ taskTimeout.clear();
306
355
  }
307
356
 
308
357
  // Truncate finalResponse to avoid edge function payload limits
@@ -41,12 +41,16 @@ Available capabilities:
41
41
  - Bash tool for shell commands
42
42
  - Glob and Grep for file search
43
43
 
44
- 3. MEMORY:
44
+ 3. MEMORY & CREDENTIALS:
45
45
  - You can remember things about the user using memory_store
46
46
  - Use this when you learn preferences, important facts, or standing instructions
47
47
  - Your stored memories persist across conversations
48
48
  - PROACTIVELY use memory_store during tasks when you discover user preferences, habits, or important context
49
49
  - Before completing a task, consider if anything learned should be remembered for future conversations
50
+ - CRITICAL — Credential Storage: When you create, register, or receive any account credentials (username, password, API keys, tokens), you MUST use credential_set to save them locally. NEVER use memory_store for credentials — memory_store is for preferences and facts, credential_set is for secrets. Examples:
51
+ * After registering a new email/account → credential_set with type "login" and data { "username": "...", "password": "...", "email": "..." }
52
+ * After generating an API key → credential_set with type "api_key" and data { "api_key": "..." }
53
+ * Credentials saved via credential_set are encrypted on disk and viewable in the desktop app's Credentials panel
50
54
 
51
55
  4. SKILL-AWARE EXECUTION (CRITICAL — follow this for EVERY task):
52
56
  Step A — Search: Before executing ANY task, check if an existing skill matches (use skill_invoke or skill_search).
@@ -21,6 +21,7 @@ export class BrowserController {
21
21
  private connected = false;
22
22
  private currentTabId: string | null = null;
23
23
  private refCache: Map<number, RefEntry> = new Map();
24
+ private frameContexts: Map<number, number> = new Map(); // refId → contextId
24
25
 
25
26
  constructor(port = 9222) {
26
27
  this.debugPort = port;
@@ -367,17 +368,50 @@ export class BrowserController {
367
368
  const selectorJS = JSON.stringify(selector);
368
369
  const textJS = JSON.stringify(text);
369
370
 
370
- // First clear and set value via JS, dispatching all relevant events
371
+ // First try to find the element in main document, then in same-origin iframes
371
372
  const result = await this.send("Runtime.evaluate", {
372
373
  expression: `
373
374
  (function() {
374
- const el = document.querySelector(${selectorJS});
375
+ var el = document.querySelector(${selectorJS});
376
+
377
+ // If not found in main document, search same-origin iframes
378
+ if (!el) {
379
+ var iframes = document.querySelectorAll('iframe');
380
+ for (var i = 0; i < iframes.length; i++) {
381
+ try {
382
+ var iframeDoc = iframes[i].contentDocument;
383
+ if (iframeDoc) {
384
+ el = iframeDoc.querySelector(${selectorJS});
385
+ if (el) break;
386
+ }
387
+ } catch(e) { /* cross-origin, skip */ }
388
+ }
389
+ }
390
+
375
391
  if (!el) return 'Element not found: ' + ${selectorJS};
376
392
 
377
393
  el.focus();
378
394
 
379
- // Clear existing value
380
- const nativeInputValueSetter = Object.getOwnPropertyDescriptor(
395
+ // Check if this is a contenteditable element (rich text editor)
396
+ var isContentEditable = el.isContentEditable ||
397
+ el.getAttribute('contenteditable') === 'true' ||
398
+ el.getAttribute('contenteditable') === '';
399
+
400
+ if (isContentEditable) {
401
+ // For contenteditable: select all content, then replace
402
+ var ownerDoc = el.ownerDocument;
403
+ var sel = ownerDoc.defaultView.getSelection();
404
+ var range = ownerDoc.createRange();
405
+ range.selectNodeContents(el);
406
+ sel.removeAllRanges();
407
+ sel.addRange(range);
408
+ // Use insertText command which respects undo stack and triggers input events
409
+ ownerDoc.execCommand('insertText', false, ${textJS});
410
+ return 'Typed into: ' + (el.tagName || '') + ' [contenteditable]';
411
+ }
412
+
413
+ // For input/textarea: clear and set value
414
+ var nativeInputValueSetter = Object.getOwnPropertyDescriptor(
381
415
  window.HTMLInputElement.prototype, 'value'
382
416
  )?.set || Object.getOwnPropertyDescriptor(
383
417
  window.HTMLTextAreaElement.prototype, 'value'
@@ -398,7 +432,36 @@ export class BrowserController {
398
432
  returnByValue: true,
399
433
  });
400
434
 
401
- return ((result as CDPEvalResult).result?.value as string) || "Text entered.";
435
+ const textResult = ((result as CDPEvalResult).result?.value as string) || "";
436
+
437
+ // If element still not found, try typing into the currently focused element via CDP
438
+ if (textResult.startsWith("Element not found")) {
439
+ return this.typeAtFocus(text);
440
+ }
441
+
442
+ return textResult || "Text entered.";
443
+ }
444
+
445
+ /**
446
+ * Type text into the currently focused element using CDP Input.insertText.
447
+ * This bypasses DOM queries entirely and works with any focused element,
448
+ * including those inside cross-origin iframes or shadow DOM.
449
+ */
450
+ async typeAtFocus(text: string): Promise<string> {
451
+ this.ensureConnected();
452
+
453
+ // Optionally clear existing content: select all then delete
454
+ const modKey = platform() === "darwin" ? "Meta" : "Control";
455
+ await this.pressKey(`${modKey}+a`);
456
+ await new Promise((r) => setTimeout(r, 50));
457
+ await this.pressKey("Backspace");
458
+ await new Promise((r) => setTimeout(r, 50));
459
+
460
+ // Insert text via CDP — goes through the browser's native input pipeline
461
+ await this.send("Input.insertText", { text });
462
+ await new Promise((r) => setTimeout(r, 100));
463
+
464
+ return "Text entered (into focused element).";
402
465
  }
403
466
 
404
467
  async pressKey(key: string): Promise<string> {
@@ -670,6 +733,9 @@ export class BrowserController {
670
733
  box: r.box as BoundingBox,
671
734
  }));
672
735
 
736
+ // 1b. Discover elements in cross-origin iframes via CDP frame targeting
737
+ await this.discoverCrossOriginFrameRefs(refs);
738
+
673
739
  // 2. Optionally inject visual overlay with ref labels
674
740
  // (Skip for dense pages — labels would overlap and become unreadable)
675
741
  if (annotate && refs.length <= 40) {
@@ -725,6 +791,8 @@ export class BrowserController {
725
791
  }
726
792
 
727
793
  // 5. Cache refs for subsequent act() calls
794
+ // Note: frameContexts is populated by discoverCrossOriginFrameRefs above,
795
+ // so we only clear refCache here (frameContexts was cleared at start of discover)
728
796
  this.refCache.clear();
729
797
  for (const ref of refs) {
730
798
  this.refCache.set(ref.id, ref);
@@ -752,6 +820,235 @@ export class BrowserController {
752
820
  return table;
753
821
  }
754
822
 
823
+ // ── Cross-Origin Iframe Discovery ────────────────────────────────
824
+
825
+ /**
826
+ * Use CDP's Page.getFrameTree + Runtime.evaluate with contextId to discover
827
+ * interactive elements inside cross-origin iframes (e.g., ProtonMail editor,
828
+ * Google Docs, embedded rich text editors).
829
+ *
830
+ * Same-origin iframes are already handled inline by the main snapshot JS.
831
+ * This method handles the ones that threw cross-origin errors.
832
+ */
833
+ private async discoverCrossOriginFrameRefs(refs: RefEntry[]): Promise<void> {
834
+ this.frameContexts.clear();
835
+ try {
836
+ // Get the frame tree to find all child frames
837
+ const frameTree = (await this.send("Page.getFrameTree")) as {
838
+ frameTree?: {
839
+ frame: { id: string };
840
+ childFrames?: Array<{ frame: { id: string; url: string; name?: string } }>;
841
+ };
842
+ };
843
+
844
+ const mainFrameId = frameTree.frameTree?.frame?.id;
845
+ const childFrames = frameTree.frameTree?.childFrames || [];
846
+ if (childFrames.length === 0) return;
847
+
848
+ // Get all execution contexts to map frameId → contextId
849
+ // We need to enable Runtime events and collect contexts
850
+ const contexts = await this.getFrameContexts(mainFrameId || "");
851
+
852
+ for (const child of childFrames) {
853
+ const frameId = child.frame.id;
854
+ const contextId = contexts.get(frameId);
855
+ if (!contextId) continue;
856
+
857
+ // Get the iframe's bounding rect from the parent frame for coordinate offset
858
+ const iframeOffsetResult = await this.send("Runtime.evaluate", {
859
+ expression: `
860
+ (function() {
861
+ var iframes = document.querySelectorAll('iframe');
862
+ for (var i = 0; i < iframes.length; i++) {
863
+ try {
864
+ // Match by frame src or name
865
+ var f = iframes[i];
866
+ if (f.contentWindow) {
867
+ var r = f.getBoundingClientRect();
868
+ if (r.width > 10 && r.height > 10) {
869
+ return JSON.stringify({ x: r.x, y: r.y, width: r.width, height: r.height, index: i });
870
+ }
871
+ }
872
+ } catch(e) {}
873
+ }
874
+ return 'null';
875
+ })()
876
+ `,
877
+ returnByValue: true,
878
+ });
879
+
880
+ let iframeOffset = { x: 0, y: 0 };
881
+ try {
882
+ const parsed = JSON.parse(
883
+ ((iframeOffsetResult as CDPEvalResult).result?.value as string) || "null"
884
+ );
885
+ if (parsed) iframeOffset = { x: parsed.x, y: parsed.y };
886
+ } catch {
887
+ /* ignore */
888
+ }
889
+
890
+ // Evaluate inside the child frame's execution context
891
+ const startRefId = refs.length + 1;
892
+ try {
893
+ const frameResult = await this.send("Runtime.evaluate", {
894
+ expression: `
895
+ (function() {
896
+ var selectors = [
897
+ 'a[href]', 'button', 'input:not([type="hidden"])', 'select', 'textarea',
898
+ '[role="button"]', '[role="link"]', '[role="checkbox"]', '[role="radio"]',
899
+ '[role="combobox"]', '[role="listbox"]', '[role="menuitem"]', '[role="tab"]',
900
+ '[role="switch"]', '[role="slider"]', '[role="option"]', '[role="searchbox"]',
901
+ '[onclick]', '[tabindex]:not([tabindex="-1"])',
902
+ '[contenteditable="true"]', '[contenteditable=""]'
903
+ ].join(', ');
904
+
905
+ var all = document.querySelectorAll(selectors);
906
+ // Also check if the body itself is contenteditable
907
+ if (document.body && (document.body.isContentEditable || document.body.getAttribute('contenteditable') === 'true')) {
908
+ all = [document.body].concat(Array.from(all));
909
+ }
910
+
911
+ var refs = [];
912
+ var startId = ${startRefId};
913
+ var vh = window.innerHeight;
914
+ var vw = window.innerWidth;
915
+
916
+ for (var i = 0; i < all.length && refs.length < 20; i++) {
917
+ var el = all[i];
918
+ var rect = el.getBoundingClientRect();
919
+ if (rect.width < 5 || rect.height < 5) continue;
920
+ var style = window.getComputedStyle(el);
921
+ if (style.display === 'none' || style.visibility === 'hidden' || style.opacity === '0') continue;
922
+
923
+ var role = el.getAttribute('role') || '';
924
+ if (!role) {
925
+ var tag = el.tagName.toLowerCase();
926
+ if (tag === 'a') role = 'link';
927
+ else if (tag === 'button') role = 'button';
928
+ else if (tag === 'input') {
929
+ var t = (el.type || 'text').toLowerCase();
930
+ if (t === 'checkbox') role = 'checkbox';
931
+ else if (t === 'radio') role = 'radio';
932
+ else if (t === 'submit' || t === 'button') role = 'button';
933
+ else role = 'textbox';
934
+ }
935
+ else if (tag === 'select') role = 'combobox';
936
+ else if (tag === 'textarea') role = 'textbox';
937
+ else if (el.isContentEditable) role = 'textbox';
938
+ else role = tag;
939
+ }
940
+
941
+ var name = '';
942
+ var ariaLabel = el.getAttribute('aria-label');
943
+ if (ariaLabel) {
944
+ name = ariaLabel;
945
+ } else if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
946
+ name = el.getAttribute('placeholder') || el.getAttribute('name') || '';
947
+ } else if (el.isContentEditable) {
948
+ name = 'compose body';
949
+ } else {
950
+ name = (el.textContent || '').trim().slice(0, 60);
951
+ }
952
+
953
+ var refId = startId + refs.length;
954
+ el.setAttribute('data-assistme-ref', String(refId));
955
+
956
+ refs.push({
957
+ id: refId,
958
+ role: role,
959
+ name: name,
960
+ tag: el.tagName.toLowerCase(),
961
+ type: el.getAttribute('type') || '',
962
+ box: {
963
+ x: Math.round(rect.x),
964
+ y: Math.round(rect.y),
965
+ width: Math.round(rect.width),
966
+ height: Math.round(rect.height)
967
+ },
968
+ inFrame: true
969
+ });
970
+ }
971
+
972
+ return JSON.stringify(refs);
973
+ })()
974
+ `,
975
+ contextId,
976
+ returnByValue: true,
977
+ });
978
+
979
+ const frameRefs = JSON.parse(
980
+ ((frameResult as CDPEvalResult).result?.value as string) || "[]"
981
+ );
982
+
983
+ for (const r of frameRefs) {
984
+ refs.push({
985
+ id: r.id as number,
986
+ role: r.role as string,
987
+ name: r.name as string,
988
+ tag: r.tag as string,
989
+ inputType: (r.type as string) || "",
990
+ box: {
991
+ x: Math.round((r.box.x as number) + iframeOffset.x),
992
+ y: Math.round((r.box.y as number) + iframeOffset.y),
993
+ width: r.box.width as number,
994
+ height: r.box.height as number,
995
+ },
996
+ });
997
+ // Store frame context for later resolution
998
+ this.frameContexts.set(r.id as number, contextId);
999
+ }
1000
+ } catch {
1001
+ // Frame evaluation failed (e.g., about:blank, pdf viewer) — skip
1002
+ }
1003
+ }
1004
+ } catch {
1005
+ // Frame tree unavailable — not critical, skip silently
1006
+ }
1007
+ }
1008
+
1009
+ /**
1010
+ * Get execution context IDs for each frame in the page.
1011
+ * Uses Runtime.executionContextCreated events collected during the session,
1012
+ * or falls back to evaluating in known frames.
1013
+ */
1014
+ private async getFrameContexts(_mainFrameId: string): Promise<Map<string, number>> {
1015
+ const contexts = new Map<string, number>();
1016
+ try {
1017
+ // Enable Runtime domain to get context descriptions (may already be enabled)
1018
+ await this.send("Runtime.enable").catch(() => {});
1019
+
1020
+ // Use Page.getFrameTree to get frame IDs, then try to create isolated worlds
1021
+ // for each frame to get their execution context IDs
1022
+ const frameTree = (await this.send("Page.getFrameTree")) as {
1023
+ frameTree?: {
1024
+ frame: { id: string };
1025
+ childFrames?: Array<{ frame: { id: string } }>;
1026
+ };
1027
+ };
1028
+
1029
+ const childFrames = frameTree.frameTree?.childFrames || [];
1030
+ for (const child of childFrames) {
1031
+ try {
1032
+ // Create an isolated world in the frame to get a context ID
1033
+ const world = (await this.send("Page.createIsolatedWorld", {
1034
+ frameId: child.frame.id,
1035
+ worldName: "assistme-snapshot",
1036
+ grantUniveralAccess: true,
1037
+ })) as { executionContextId?: number };
1038
+
1039
+ if (world.executionContextId) {
1040
+ contexts.set(child.frame.id, world.executionContextId);
1041
+ }
1042
+ } catch {
1043
+ // Frame might not support isolated worlds — skip
1044
+ }
1045
+ }
1046
+ } catch {
1047
+ // Fallback: no contexts available
1048
+ }
1049
+ return contexts;
1050
+ }
1051
+
755
1052
  // ── Ref Resolution ────────────────────────────────────────────────
756
1053
 
757
1054
  /**
@@ -869,9 +1166,103 @@ export class BrowserController {
869
1166
  });
870
1167
 
871
1168
  const value = (result as CDPEvalResult).result?.value as string;
872
- if (!value || value === "null") return null;
1169
+ if (value && value !== "null") {
1170
+ try {
1171
+ return JSON.parse(value);
1172
+ } catch {
1173
+ /* fall through to frame search */
1174
+ }
1175
+ }
1176
+
1177
+ // Strategy 3: search in cross-origin iframe contexts
1178
+ const frameContextId = this.frameContexts.get(refId);
1179
+ if (frameContextId) {
1180
+ return this.resolveRefInFrame(refId, frameContextId, role, name);
1181
+ }
1182
+
1183
+ return null;
1184
+ }
1185
+
1186
+ /**
1187
+ * Resolve a ref inside a cross-origin iframe using its execution context.
1188
+ * Returns coordinates adjusted by the iframe's viewport offset.
1189
+ */
1190
+ private async resolveRefInFrame(
1191
+ refId: number,
1192
+ contextId: number,
1193
+ role: string,
1194
+ name: string
1195
+ ): Promise<{ x: number; y: number; width: number; height: number; error?: string } | null> {
1196
+ const roleJS = JSON.stringify(role);
1197
+ const nameJS = JSON.stringify(name);
1198
+
873
1199
  try {
874
- return JSON.parse(value);
1200
+ // Get iframe offset from main document
1201
+ const offsetResult = await this.send("Runtime.evaluate", {
1202
+ expression: `
1203
+ (function() {
1204
+ var iframes = document.querySelectorAll('iframe');
1205
+ for (var i = 0; i < iframes.length; i++) {
1206
+ var r = iframes[i].getBoundingClientRect();
1207
+ if (r.width > 10 && r.height > 10) {
1208
+ return JSON.stringify({ x: r.x, y: r.y });
1209
+ }
1210
+ }
1211
+ return JSON.stringify({ x: 0, y: 0 });
1212
+ })()
1213
+ `,
1214
+ returnByValue: true,
1215
+ });
1216
+ const offset = JSON.parse(
1217
+ ((offsetResult as CDPEvalResult).result?.value as string) || '{"x":0,"y":0}'
1218
+ );
1219
+
1220
+ // Resolve element inside the frame
1221
+ const frameResult = await this.send("Runtime.evaluate", {
1222
+ expression: `
1223
+ (function() {
1224
+ var el = document.querySelector('[data-assistme-ref="${refId}"]');
1225
+ if (!el && ${roleJS} && ${nameJS}) {
1226
+ // Fallback: search by role
1227
+ var candidates = document.querySelectorAll('*');
1228
+ for (var i = 0; i < candidates.length; i++) {
1229
+ var c = candidates[i];
1230
+ if (c.isContentEditable || c.getAttribute('contenteditable') === 'true') {
1231
+ el = c; break;
1232
+ }
1233
+ }
1234
+ }
1235
+ if (!el) return 'null';
1236
+
1237
+ el.scrollIntoView({ block: 'center', behavior: 'instant' });
1238
+ var r = el.getBoundingClientRect();
1239
+ if (r.width < 1 || r.height < 1) return JSON.stringify({ error: 'Zero size' });
1240
+
1241
+ return JSON.stringify({
1242
+ x: r.x + r.width / 2,
1243
+ y: r.y + r.height / 2,
1244
+ width: r.width,
1245
+ height: r.height
1246
+ });
1247
+ })()
1248
+ `,
1249
+ contextId,
1250
+ returnByValue: true,
1251
+ });
1252
+
1253
+ const value = (frameResult as CDPEvalResult).result?.value as string;
1254
+ if (!value || value === "null") return null;
1255
+
1256
+ const parsed = JSON.parse(value);
1257
+ if (parsed.error) return parsed;
1258
+
1259
+ // Adjust coordinates by iframe offset
1260
+ return {
1261
+ x: parsed.x + offset.x,
1262
+ y: parsed.y + offset.y,
1263
+ width: parsed.width,
1264
+ height: parsed.height,
1265
+ };
875
1266
  } catch {
876
1267
  return null;
877
1268
  }
@@ -981,11 +1372,24 @@ export class BrowserController {
981
1372
  await new Promise((r) => setTimeout(r, 50));
982
1373
 
983
1374
  // 2. Verify the field is empty; if not, fall back to JS-based clearing
984
- const cleared = await this.send("Runtime.evaluate", {
1375
+ // Determine which context to evaluate in (main doc or iframe)
1376
+ const frameContextId = this.frameContexts.get(refId);
1377
+ const clearEvalOpts: Record<string, unknown> = {
985
1378
  expression: `
986
1379
  (function() {
987
1380
  var el = document.querySelector('[data-assistme-ref="${refId}"]');
988
1381
  if (!el) return 'no_element';
1382
+
1383
+ // For contenteditable elements, check textContent instead of value
1384
+ if (el.isContentEditable || el.getAttribute('contenteditable') === 'true') {
1385
+ if (el.textContent && el.textContent.trim() !== '') {
1386
+ el.textContent = '';
1387
+ el.dispatchEvent(new Event('input', { bubbles: true }));
1388
+ return 'js_cleared';
1389
+ }
1390
+ return 'ok';
1391
+ }
1392
+
989
1393
  if (el.value !== undefined && el.value !== '') {
990
1394
  // Ctrl+A didn't work (some frameworks intercept it) — clear via JS
991
1395
  var setter = Object.getOwnPropertyDescriptor(
@@ -1003,9 +1407,15 @@ export class BrowserController {
1003
1407
  })()
1004
1408
  `,
1005
1409
  returnByValue: true,
1006
- });
1410
+ };
1411
+ // If element is in a cross-origin iframe, evaluate in its context
1412
+ if (frameContextId) {
1413
+ clearEvalOpts.contextId = frameContextId;
1414
+ }
1415
+ const cleared = await this.send("Runtime.evaluate", clearEvalOpts);
1007
1416
  const clearStatus = ((cleared as CDPEvalResult).result?.value as string) || "ok";
1008
- if (clearStatus === "no_element") {
1417
+ if (clearStatus === "no_element" && !frameContextId) {
1418
+ // Element not found in main doc and no frame context — truly missing
1009
1419
  return {
1010
1420
  success: false,
1011
1421
  message: `Ref ${refLabel} not found after click. Take a new snapshot.`,
package/src/db/event.ts CHANGED
@@ -2,6 +2,36 @@ import { callMcpHandler } from "./api-client.js";
2
2
  import { log } from "../utils/logger.js";
3
3
  import type { EventType } from "./types.js";
4
4
 
5
+ const MAX_EMIT_RETRIES = 2;
6
+ const EMIT_RETRY_DELAY_MS = 500;
7
+
8
/**
 * Send one event through the "event.emit" MCP handler, retrying on failure.
 *
 * Up to MAX_EMIT_RETRIES + 1 attempts are made. After the k-th failed attempt
 * (k = 1, 2, …) the function waits EMIT_RETRY_DELAY_MS * k before trying
 * again. If the final attempt also fails, a warning is logged and the
 * function resolves normally — it never rejects.
 *
 * @param messageId Id of the message the event belongs to.
 * @param eventType Type of the event being emitted.
 * @param eventData Arbitrary event payload.
 * @param seq Sequence number assigned by the caller.
 */
async function emitWithRetry(
  messageId: string,
  eventType: EventType,
  eventData: Record<string, unknown>,
  seq: number
): Promise<void> {
  const totalAttempts = MAX_EMIT_RETRIES + 1;
  let failures = 0;

  while (failures < totalAttempts) {
    try {
      await callMcpHandler("event.emit", {
        message_id: messageId,
        event_type: eventType,
        event_data: eventData,
        seq,
      });
      return;
    } catch (err) {
      failures += 1;

      if (failures >= totalAttempts) {
        // Out of attempts: report and give up without throwing.
        log.warn(
          `Failed to emit event after ${MAX_EMIT_RETRIES + 1} attempts: ${err instanceof Error ? err.message : err}`
        );
        return;
      }

      // Linear back-off: the wait grows with each failed attempt.
      await new Promise((wake) => setTimeout(wake, EMIT_RETRY_DELAY_MS * failures));
    }
  }
}
34
+
5
35
  /**
6
36
  * Per-task event emitter. Each task gets its own sequence counter
7
37
  * to avoid cross-task sequence number collisions.
@@ -13,16 +43,7 @@ export class TaskEventEmitter {
13
43
 
14
44
  async emit(eventType: EventType, eventData: Record<string, unknown>): Promise<void> {
15
45
  this.sequence++;
16
- try {
17
- await callMcpHandler("event.emit", {
18
- message_id: this.messageId,
19
- event_type: eventType,
20
- event_data: eventData,
21
- seq: this.sequence,
22
- });
23
- } catch (err) {
24
- log.warn(`Failed to emit event: ${err instanceof Error ? err.message : err}`);
25
- }
46
+ await emitWithRetry(this.messageId, eventType, eventData, this.sequence);
26
47
  }
27
48
  }
28
49
 
@@ -39,14 +60,5 @@ export async function emitEvent(
39
60
  eventData: Record<string, unknown>
40
61
  ): Promise<void> {
41
62
  eventSequence++;
42
- try {
43
- await callMcpHandler("event.emit", {
44
- message_id: messageId,
45
- event_type: eventType,
46
- event_data: eventData,
47
- seq: eventSequence,
48
- });
49
- } catch (err) {
50
- log.warn(`Failed to emit event: ${err instanceof Error ? err.message : err}`);
51
- }
63
+ await emitWithRetry(messageId, eventType, eventData, eventSequence);
52
64
  }