assistme 0.3.2 → 0.3.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "assistme",
3
- "version": "0.3.2",
3
+ "version": "0.3.4",
4
4
  "description": "AssistMe CLI Agent - AI-powered assistant that controls your real browser",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -31,6 +31,55 @@ import {
31
31
  import { createEventHooks } from "./event-hooks.js";
32
32
  import { BASE_SYSTEM_PROMPT } from "./system-prompt.js";
33
33
 
34
+ /**
35
+ * Manages the task wall-clock timeout.
36
+ * Supports pausing while the agent is waiting for user input (ask_user)
37
+ * so that idle wait time doesn't count toward the timeout.
38
+ */
39
+ class TaskTimeout {
40
+ private timeoutId: ReturnType<typeof setTimeout> | null = null;
41
+ private remainingMs: number;
42
+ private resumedAt: number;
43
+
44
+ constructor(
45
+ private abortController: AbortController,
46
+ timeoutMs: number
47
+ ) {
48
+ this.remainingMs = timeoutMs;
49
+ this.resumedAt = Date.now();
50
+ this.schedule();
51
+ }
52
+
53
+ private schedule(): void {
54
+ this.timeoutId = setTimeout(() => {
55
+ this.abortController.abort();
56
+ }, this.remainingMs);
57
+ }
58
+
59
+ /** Pause the timeout (e.g. while waiting for user). */
60
+ pause(): void {
61
+ if (this.timeoutId) {
62
+ clearTimeout(this.timeoutId);
63
+ this.timeoutId = null;
64
+ const elapsed = Date.now() - this.resumedAt;
65
+ this.remainingMs = Math.max(0, this.remainingMs - elapsed);
66
+ }
67
+ }
68
+
69
+ /** Resume the timeout after user interaction completes. */
70
+ resume(): void {
71
+ this.resumedAt = Date.now();
72
+ this.schedule();
73
+ }
74
+
75
+ clear(): void {
76
+ if (this.timeoutId) {
77
+ clearTimeout(this.timeoutId);
78
+ this.timeoutId = null;
79
+ }
80
+ }
81
+ }
82
+
34
83
  const MAX_HISTORY_ENTRIES = 10;
35
84
  const MAX_RESPONSE_LENGTH = 1500;
36
85
 
@@ -143,6 +192,9 @@ export class TaskProcessor {
143
192
  systemPrompt += historyPrompt;
144
193
  }
145
194
 
195
+ const abortController = new AbortController();
196
+ const taskTimeout = new TaskTimeout(abortController, taskTimeoutMs);
197
+
146
198
  // Create MCP servers for custom tools
147
199
  const browserServer = createBrowserMcpServer();
148
200
  const agentToolsServer = createAgentToolsServer({
@@ -150,6 +202,8 @@ export class TaskProcessor {
150
202
  skillManager: this.skillManager,
151
203
  taskId: task.id,
152
204
  sessionId: this.sessionId || undefined,
205
+ onUserWaitStart: () => taskTimeout.pause(),
206
+ onUserWaitEnd: () => taskTimeout.resume(),
153
207
  });
154
208
 
155
209
  // Create event hooks for Supabase event emission
@@ -203,7 +257,6 @@ export class TaskProcessor {
203
257
  };
204
258
  }
205
259
 
206
- const abortController = new AbortController();
207
260
  const options: Options = {
208
261
  model: config.model,
209
262
  systemPrompt,
@@ -221,11 +274,7 @@ export class TaskProcessor {
221
274
  abortController,
222
275
  };
223
276
 
224
- // Wall-clock timeout via abort
225
277
  const taskStartTime = Date.now();
226
- const timeoutId = setTimeout(() => {
227
- abortController.abort();
228
- }, taskTimeoutMs);
229
278
 
230
279
  try {
231
280
  for await (const message of query({
@@ -302,11 +351,18 @@ export class TaskProcessor {
302
351
  }
303
352
  }
304
353
  } finally {
305
- clearTimeout(timeoutId);
354
+ taskTimeout.clear();
306
355
  }
307
356
 
357
+ // Truncate finalResponse to avoid edge function payload limits
358
+ const MAX_CONTENT_LENGTH = 50_000;
359
+ const truncatedResponse =
360
+ finalResponse.length > MAX_CONTENT_LENGTH
361
+ ? finalResponse.slice(0, MAX_CONTENT_LENGTH) + "\n\n[Response truncated]"
362
+ : finalResponse;
363
+
308
364
  // Complete the task (with retry for transient DB failures)
309
- await withRetry(() => completeTask(task.id, finalResponse, tokenUsage), {
365
+ await withRetry(() => completeTask(task.id, truncatedResponse, tokenUsage), {
310
366
  maxRetries: 2,
311
367
  baseDelayMs: 300,
312
368
  label: "completeTask",
@@ -28,7 +28,7 @@ Available capabilities:
28
28
  - Refs persist across actions unless the page navigates. Re-snapshot after navigation or major DOM changes.
29
29
 
30
30
  **Legacy tools (still available, use when refs don't work):**
31
- - browser_click, browser_type, browser_select, browser_get_elements, browser_screenshot, browser_evaluate
31
+ - browser_click, browser_type, browser_select, browser_screenshot, browser_evaluate
32
32
  - browser_click supports :contains('text') pseudo-selectors
33
33
  - browser_select handles native and custom dropdowns
34
34
 
@@ -41,12 +41,16 @@ Available capabilities:
41
41
  - Bash tool for shell commands
42
42
  - Glob and Grep for file search
43
43
 
44
- 3. MEMORY:
44
+ 3. MEMORY & CREDENTIALS:
45
45
  - You can remember things about the user using memory_store
46
46
  - Use this when you learn preferences, important facts, or standing instructions
47
47
  - Your stored memories persist across conversations
48
48
  - PROACTIVELY use memory_store during tasks when you discover user preferences, habits, or important context
49
49
  - Before completing a task, consider if anything learned should be remembered for future conversations
50
+ - CRITICAL — Credential Storage: When you create, register, or receive any account credentials (username, password, API keys, tokens), you MUST use credential_set to save them locally. NEVER use memory_store for credentials — memory_store is for preferences and facts, credential_set is for secrets. Examples:
51
+ * After registering a new email/account → credential_set with type "login" and data { "username": "...", "password": "...", "email": "..." }
52
+ * After generating an API key → credential_set with type "api_key" and data { "api_key": "..." }
53
+ * Credentials saved via credential_set are encrypted on disk and viewable in the desktop app's Credentials panel
50
54
 
51
55
  4. SKILL-AWARE EXECUTION (CRITICAL — follow this for EVERY task):
52
56
  Step A — Search: Before executing ANY task, check if an existing skill matches (use skill_invoke or skill_search).
@@ -545,11 +545,13 @@ export async function ensureBrowserAvailable(port = 9222): Promise<AutoLaunchRes
545
545
 
546
546
  // ── Singleton ───────────────────────────────────────────────────────
547
547
 
548
- let browserInstance: BrowserController | null = null;
548
+ const browserInstances = new Map<number, BrowserController>();
549
549
 
550
550
  export function getBrowser(port = 9222): BrowserController {
551
- if (!browserInstance) {
552
- browserInstance = new BrowserController(port);
551
+ let instance = browserInstances.get(port);
552
+ if (!instance) {
553
+ instance = new BrowserController(port);
554
+ browserInstances.set(port, instance);
553
555
  }
554
- return browserInstance;
556
+ return instance;
555
557
  }
@@ -10,6 +10,7 @@ import type {
10
10
  SnapshotResult,
11
11
  ActionSpec,
12
12
  ActionResult,
13
+ RefActionResult,
13
14
  } from "./types.js";
14
15
 
15
16
  export class BrowserController {
@@ -198,11 +199,26 @@ export class BrowserController {
198
199
 
199
200
  async goBack(): Promise<string> {
200
201
  this.ensureConnected();
201
- await this.send("Page.navigateToHistoryEntry", {
202
- entryId: -1,
203
- }).catch(() => {});
204
- // Fallback: use JS
205
- await this.evaluate("window.history.back()");
202
+ try {
203
+ // Get navigation history and go to the previous entry
204
+ const history = (await this.send("Page.getNavigationHistory")) as {
205
+ currentIndex?: number;
206
+ entries?: Array<{ id: number }>;
207
+ };
208
+ const idx = history.currentIndex ?? 0;
209
+ const entries = history.entries ?? [];
210
+ if (idx > 0 && entries[idx - 1]) {
211
+ await this.send("Page.navigateToHistoryEntry", {
212
+ entryId: entries[idx - 1].id,
213
+ });
214
+ } else {
215
+ // No previous entry in CDP history — use JS fallback
216
+ await this.evaluate("window.history.back()");
217
+ }
218
+ } catch {
219
+ // CDP history API failed — use JS fallback
220
+ await this.evaluate("window.history.back()");
221
+ }
206
222
  await this.waitForLoad();
207
223
  const info = await this.getPageInfo();
208
224
  return `Went back to: ${info.title}`;
@@ -394,31 +410,88 @@ export class BrowserController {
394
410
  Tab: { keyCode: 9, code: "Tab" },
395
411
  Escape: { keyCode: 27, code: "Escape" },
396
412
  Backspace: { keyCode: 8, code: "Backspace" },
413
+ Delete: { keyCode: 46, code: "Delete" },
397
414
  ArrowDown: { keyCode: 40, code: "ArrowDown" },
398
415
  ArrowUp: { keyCode: 38, code: "ArrowUp" },
416
+ ArrowLeft: { keyCode: 37, code: "ArrowLeft" },
417
+ ArrowRight: { keyCode: 39, code: "ArrowRight" },
418
+ Home: { keyCode: 36, code: "Home" },
419
+ End: { keyCode: 35, code: "End" },
420
+ Space: { keyCode: 32, code: "Space" },
399
421
  };
400
422
 
401
- const mapped = keyMap[key];
423
+ // CDP modifier bitmask values
424
+ const modifierMap: Record<string, number> = {
425
+ Alt: 1,
426
+ Control: 2,
427
+ Meta: 4,
428
+ Shift: 8,
429
+ };
430
+
431
+ // Parse modifier combos like "Control+a", "Meta+Shift+z"
432
+ const parts = key.split("+");
433
+ let modifiers = 0;
434
+ let actualKey = parts[parts.length - 1];
435
+ for (let i = 0; i < parts.length - 1; i++) {
436
+ const mod = modifierMap[parts[i]];
437
+ if (mod) modifiers |= mod;
438
+ }
439
+
440
+ const mapped = keyMap[actualKey];
402
441
  if (mapped) {
403
442
  await this.send("Input.dispatchKeyEvent", {
404
443
  type: "keyDown",
405
- key,
444
+ key: actualKey,
406
445
  code: mapped.code,
407
446
  windowsVirtualKeyCode: mapped.keyCode,
408
447
  nativeVirtualKeyCode: mapped.keyCode,
448
+ modifiers,
409
449
  });
410
450
  await this.send("Input.dispatchKeyEvent", {
411
451
  type: "keyUp",
412
- key,
452
+ key: actualKey,
413
453
  code: mapped.code,
414
454
  windowsVirtualKeyCode: mapped.keyCode,
415
455
  nativeVirtualKeyCode: mapped.keyCode,
456
+ modifiers,
457
+ });
458
+ } else if (actualKey.length === 1) {
459
+ // Single character key (e.g., "a", "z")
460
+ const code = `Key${actualKey.toUpperCase()}`;
461
+ const keyCode = actualKey.toUpperCase().charCodeAt(0);
462
+ await this.send("Input.dispatchKeyEvent", {
463
+ type: "keyDown",
464
+ key: actualKey,
465
+ code,
466
+ windowsVirtualKeyCode: keyCode,
467
+ nativeVirtualKeyCode: keyCode,
468
+ modifiers,
469
+ });
470
+ if (!modifiers) {
471
+ // Only insert text for unmodified single characters
472
+ await this.send("Input.dispatchKeyEvent", {
473
+ type: "char",
474
+ text: actualKey,
475
+ modifiers,
476
+ });
477
+ }
478
+ await this.send("Input.dispatchKeyEvent", {
479
+ type: "keyUp",
480
+ key: actualKey,
481
+ code,
482
+ modifiers,
416
483
  });
417
484
  } else {
418
- // Single character key
485
+ // Unknown key name — try as-is
419
486
  await this.send("Input.dispatchKeyEvent", {
420
- type: "char",
421
- text: key,
487
+ type: "keyDown",
488
+ key: actualKey,
489
+ modifiers,
490
+ });
491
+ await this.send("Input.dispatchKeyEvent", {
492
+ type: "keyUp",
493
+ key: actualKey,
494
+ modifiers,
422
495
  });
423
496
  }
424
497
 
@@ -816,8 +889,10 @@ export class BrowserController {
816
889
  * element is not yet actionable (e.g., covered by a loading overlay, still
817
890
  * animating into view). This matches Playwright's auto-waiting behavior.
818
891
  */
819
- async clickRef(refId: number): Promise<string> {
892
+ async clickRef(refId: number): Promise<RefActionResult> {
820
893
  this.ensureConnected();
894
+ const ref = this.refCache.get(refId);
895
+ const refLabel = `[${refId}] ${ref?.role || ""} "${ref?.name || ""}"`;
821
896
 
822
897
  // Auto-wait: retry up to 3 times if element is not actionable yet
823
898
  const maxRetries = 3;
@@ -827,7 +902,10 @@ export class BrowserController {
827
902
  const resolved = await this.resolveRef(refId);
828
903
 
829
904
  if (!resolved) {
830
- return `Ref [${refId}] not found. Take a new snapshot with browser_snapshot.`;
905
+ return {
906
+ success: false,
907
+ message: `Ref ${refLabel} not found. Take a new snapshot with browser_snapshot.`,
908
+ };
831
909
  }
832
910
 
833
911
  if (resolved.error) {
@@ -837,9 +915,7 @@ export class BrowserController {
837
915
  await new Promise((r) => setTimeout(r, 500));
838
916
  continue;
839
917
  }
840
- // Final attempt failed report the actionability issue
841
- const ref = this.refCache.get(refId);
842
- return `Cannot click [${refId}] ${ref?.role || ""} "${ref?.name || ""}": ${lastError}`;
918
+ return { success: false, message: `Cannot click ${refLabel}: ${lastError}` };
843
919
  }
844
920
 
845
921
  // Element is actionable — small delay after scroll for rendering
@@ -875,13 +951,10 @@ export class BrowserController {
875
951
  });
876
952
 
877
953
  await new Promise((r) => setTimeout(r, 300));
878
- const ref = this.refCache.get(refId);
879
- return `Clicked [${refId}] ${ref?.role || ""} "${ref?.name || ""}"`;
954
+ return { success: true, message: `Clicked ${refLabel}` };
880
955
  }
881
956
 
882
- // Should not reach here, but just in case
883
- const ref = this.refCache.get(refId);
884
- return `Cannot click [${refId}] ${ref?.role || ""} "${ref?.name || ""}": ${lastError}`;
957
+ return { success: false, message: `Cannot click ${refLabel}: ${lastError}` };
885
958
  }
886
959
 
887
960
  /**
@@ -889,48 +962,61 @@ export class BrowserController {
889
962
  * Clicks to focus, selects all existing text (Ctrl/Cmd+A), then uses
890
963
  * Input.insertText for reliable text insertion across all frameworks.
891
964
  */
892
- async typeRef(refId: number, text: string): Promise<string> {
965
+ async typeRef(refId: number, text: string): Promise<RefActionResult> {
893
966
  this.ensureConnected();
967
+ const ref = this.refCache.get(refId);
968
+ const refLabel = `[${refId}] ${ref?.role || ""} "${ref?.name || ""}"`;
894
969
 
895
970
  // Click to focus the element
896
971
  const clickResult = await this.clickRef(refId);
897
- if (clickResult.includes("not found")) return clickResult;
972
+ if (!clickResult.success) return clickResult;
898
973
  await new Promise((r) => setTimeout(r, 100));
899
974
 
900
- // Select all existing text (Cmd+A on macOS, Ctrl+A elsewhere)
901
- const modifier = platform() === "darwin" ? 4 : 2;
902
- await this.send("Input.dispatchKeyEvent", {
903
- type: "keyDown",
904
- modifiers: modifier,
905
- key: "a",
906
- code: "KeyA",
907
- windowsVirtualKeyCode: 65,
908
- });
909
- await this.send("Input.dispatchKeyEvent", {
910
- type: "keyUp",
911
- key: "a",
912
- code: "KeyA",
913
- });
975
+ // Clear existing text using multiple strategies for reliability:
976
+ // 1. Try Ctrl/Cmd+A to select all, then Backspace to delete
977
+ const selectAllKey = platform() === "darwin" ? "Meta+a" : "Control+a";
978
+ await this.pressKey(selectAllKey);
979
+ await new Promise((r) => setTimeout(r, 50));
980
+ await this.pressKey("Backspace");
981
+ await new Promise((r) => setTimeout(r, 50));
914
982
 
915
- // Delete selected text
916
- await this.send("Input.dispatchKeyEvent", {
917
- type: "keyDown",
918
- key: "Backspace",
919
- code: "Backspace",
920
- windowsVirtualKeyCode: 8,
921
- });
922
- await this.send("Input.dispatchKeyEvent", {
923
- type: "keyUp",
924
- key: "Backspace",
925
- code: "Backspace",
983
+ // 2. Verify the field is empty; if not, fall back to JS-based clearing
984
+ const cleared = await this.send("Runtime.evaluate", {
985
+ expression: `
986
+ (function() {
987
+ var el = document.querySelector('[data-assistme-ref="${refId}"]');
988
+ if (!el) return 'no_element';
989
+ if (el.value !== undefined && el.value !== '') {
990
+ // Ctrl+A didn't work (some frameworks intercept it) — clear via JS
991
+ var setter = Object.getOwnPropertyDescriptor(
992
+ window.HTMLInputElement.prototype, 'value'
993
+ )?.set || Object.getOwnPropertyDescriptor(
994
+ window.HTMLTextAreaElement.prototype, 'value'
995
+ )?.set;
996
+ if (setter) setter.call(el, '');
997
+ else el.value = '';
998
+ el.dispatchEvent(new Event('input', { bubbles: true }));
999
+ el.dispatchEvent(new Event('change', { bubbles: true }));
1000
+ return 'js_cleared';
1001
+ }
1002
+ return 'ok';
1003
+ })()
1004
+ `,
1005
+ returnByValue: true,
926
1006
  });
1007
+ const clearStatus = ((cleared as CDPEvalResult).result?.value as string) || "ok";
1008
+ if (clearStatus === "no_element") {
1009
+ return {
1010
+ success: false,
1011
+ message: `Ref ${refLabel} not found after click. Take a new snapshot.`,
1012
+ };
1013
+ }
927
1014
 
928
1015
  // Insert text via CDP (goes through the browser's input pipeline)
929
1016
  await this.send("Input.insertText", { text });
930
1017
 
931
1018
  await new Promise((r) => setTimeout(r, 100));
932
- const ref = this.refCache.get(refId);
933
- return `Typed "${text}" into [${refId}] ${ref?.role || ""} "${ref?.name || ""}"`;
1019
+ return { success: true, message: `Typed "${text}" into ${refLabel}` };
934
1020
  }
935
1021
 
936
1022
  /**
@@ -938,21 +1024,22 @@ export class BrowserController {
938
1024
  * ref's data attribute as selector, handling both native <select> and
939
1025
  * custom dropdown components.
940
1026
  */
941
- async selectRef(refId: number, option: string): Promise<string> {
1027
+ async selectRef(refId: number, option: string): Promise<RefActionResult> {
942
1028
  this.ensureConnected();
943
1029
 
944
- // Check if ref exists
945
1030
  const cached = this.refCache.get(refId);
946
1031
  if (!cached) {
947
- return `Ref [${refId}] not found. Take a new snapshot with browser_snapshot.`;
1032
+ return {
1033
+ success: false,
1034
+ message: `Ref [${refId}] not found. Take a new snapshot with browser_snapshot.`,
1035
+ };
948
1036
  }
949
1037
 
950
- // Use the data attribute selector to find the element
1038
+ const refLabel = `[${refId}] ${cached.role} "${cached.name}"`;
951
1039
  const result = await this.selectOption(`[data-assistme-ref="${refId}"]`, option);
952
- return result.replace(
953
- /\[data-assistme-ref="\d+"\]/,
954
- `[${refId}] ${cached.role} "${cached.name}"`
955
- );
1040
+ const message = result.replace(/\[data-assistme-ref="\d+"\]/, refLabel);
1041
+ const success = !result.includes("not found");
1042
+ return { success, message };
956
1043
  }
957
1044
 
958
1045
  // ── Action Pipeline ───────────────────────────────────────────────
@@ -977,18 +1064,24 @@ export class BrowserController {
977
1064
 
978
1065
  try {
979
1066
  switch (spec.action) {
980
- case "click":
981
- result = await this.clickRef(spec.ref);
982
- success = !result.includes("not found");
1067
+ case "click": {
1068
+ const r = await this.clickRef(spec.ref);
1069
+ result = r.message;
1070
+ success = r.success;
983
1071
  break;
984
- case "type":
985
- result = await this.typeRef(spec.ref, spec.text);
986
- success = !result.includes("not found");
1072
+ }
1073
+ case "type": {
1074
+ const r = await this.typeRef(spec.ref, spec.text);
1075
+ result = r.message;
1076
+ success = r.success;
987
1077
  break;
988
- case "select":
989
- result = await this.selectRef(spec.ref, spec.option);
990
- success = !result.includes("not found");
1078
+ }
1079
+ case "select": {
1080
+ const r = await this.selectRef(spec.ref, spec.option);
1081
+ result = r.message;
1082
+ success = r.success;
991
1083
  break;
1084
+ }
992
1085
  case "press":
993
1086
  result = await this.pressKey(spec.key);
994
1087
  break;
@@ -1074,15 +1167,24 @@ export class BrowserController {
1074
1167
  // Strategy 2: Custom dropdown — find the trigger element
1075
1168
  var trigger = selectEl;
1076
1169
  if (!trigger) {
1077
- // Try finding by label/placeholder text
1078
- var allEls = document.querySelectorAll('*');
1079
- for (var j = 0; j < allEls.length; j++) {
1080
- var el = allEls[j];
1170
+ // Try finding by aria-label first (fast, indexed)
1171
+ trigger = document.querySelector('[aria-label="' + sel.replace(/"/g, '\\"') + '"]');
1172
+ }
1173
+ if (!trigger) {
1174
+ // Try finding by label/placeholder text in likely dropdown elements
1175
+ var dropdownCandidates = document.querySelectorAll(
1176
+ 'button, [role="combobox"], [role="listbox"], [role="button"], ' +
1177
+ 'select, input, .MuiSelect-root, .MuiInput-root, ' +
1178
+ '[class*="select"], [class*="dropdown"], [class*="picker"]'
1179
+ );
1180
+ for (var j = 0; j < dropdownCandidates.length; j++) {
1181
+ var el = dropdownCandidates[j];
1081
1182
  var ownText = Array.from(el.childNodes)
1082
1183
  .filter(function(n) { return n.nodeType === 3; })
1083
1184
  .map(function(n) { return n.textContent.trim(); })
1084
1185
  .join('');
1085
- if (ownText === sel || el.getAttribute('aria-label') === sel) {
1186
+ if (ownText === sel || el.getAttribute('aria-label') === sel ||
1187
+ el.getAttribute('placeholder') === sel) {
1086
1188
  trigger = el;
1087
1189
  break;
1088
1190
  }
@@ -1119,10 +1221,13 @@ export class BrowserController {
1119
1221
  }
1120
1222
  }
1121
1223
 
1122
- // Broader search: any visible element with exact text match
1123
- var everything = document.querySelectorAll('*');
1124
- for (var m = 0; m < everything.length; m++) {
1125
- var candidate = everything[m];
1224
+ // Broader search: visible leaf elements in interactive containers
1225
+ var broadCandidates = document.querySelectorAll(
1226
+ 'li, span, div, a, button, label, [role="option"], [role="menuitem"], ' +
1227
+ '[role="menuitemradio"], [role="menuitemcheckbox"], [data-value]'
1228
+ );
1229
+ for (var m = 0; m < broadCandidates.length; m++) {
1230
+ var candidate = broadCandidates[m];
1126
1231
  if (candidate.textContent && candidate.textContent.trim() === optText &&
1127
1232
  candidate.offsetParent !== null && candidate.children.length === 0) {
1128
1233
  candidate.click();
@@ -1217,6 +1322,7 @@ export class BrowserController {
1217
1322
 
1218
1323
  private async waitForLoad(timeoutMs = 8000): Promise<void> {
1219
1324
  const start = Date.now();
1325
+ let sawInteractive = false;
1220
1326
  while (Date.now() - start < timeoutMs) {
1221
1327
  try {
1222
1328
  const result = await this.send("Runtime.evaluate", {
@@ -1224,71 +1330,27 @@ export class BrowserController {
1224
1330
  returnByValue: true,
1225
1331
  });
1226
1332
  const state = (result as CDPEvalResult).result?.value;
1227
- if (state === "complete" || state === "interactive") {
1228
- // Extra small wait for dynamic content
1229
- await new Promise((r) => setTimeout(r, 500));
1333
+ if (state === "complete") {
1334
+ // Fully loaded — brief wait for dynamic content
1335
+ await new Promise((r) => setTimeout(r, 300));
1230
1336
  return;
1231
1337
  }
1338
+ if (state === "interactive") {
1339
+ if (!sawInteractive) {
1340
+ sawInteractive = true;
1341
+ // DOM is ready but sub-resources still loading — give it more
1342
+ // time to reach "complete" before settling for "interactive"
1343
+ }
1344
+ }
1232
1345
  } catch {
1233
1346
  // Tab might be navigating
1234
1347
  }
1235
1348
  await new Promise((r) => setTimeout(r, 300));
1236
1349
  }
1237
- }
1238
-
1239
- /**
1240
- * Find interactive elements on the page for the AI to understand what's clickable
1241
- */
1242
- async getInteractiveElements(): Promise<string> {
1243
- this.ensureConnected();
1244
- const result = await this.send("Runtime.evaluate", {
1245
- expression: `
1246
- (function() {
1247
- const elements = [];
1248
- const selectors = 'a, button, input, select, textarea, [role="button"], [onclick]';
1249
- const all = document.querySelectorAll(selectors);
1250
- for (let i = 0; i < all.length && elements.length < 50; i++) {
1251
- const el = all[i];
1252
- const rect = el.getBoundingClientRect();
1253
- if (rect.width === 0 || rect.height === 0) continue; // Skip hidden
1254
-
1255
- // Build a reliable CSS selector
1256
- let selector;
1257
- if (el.id) {
1258
- selector = '#' + CSS.escape(el.id);
1259
- } else if (el.getAttribute('data-testid')) {
1260
- selector = '[data-testid="' + el.getAttribute('data-testid') + '"]';
1261
- } else {
1262
- // Build a path-based selector: find nth-of-type among siblings
1263
- const tag = el.tagName.toLowerCase();
1264
- const parent = el.parentElement;
1265
- if (parent) {
1266
- const siblings = parent.querySelectorAll(':scope > ' + tag);
1267
- const idx = Array.from(siblings).indexOf(el) + 1;
1268
- selector = tag + ':nth-of-type(' + idx + ')';
1269
- } else {
1270
- selector = tag;
1271
- }
1272
- }
1273
-
1274
- elements.push({
1275
- tag: el.tagName.toLowerCase(),
1276
- text: (el.textContent || '').trim().slice(0, 80),
1277
- type: el.getAttribute('type') || '',
1278
- name: el.getAttribute('name') || '',
1279
- id: el.id || '',
1280
- href: el.getAttribute('href') || '',
1281
- placeholder: el.getAttribute('placeholder') || '',
1282
- selector: selector,
1283
- });
1284
- }
1285
- return JSON.stringify(elements, null, 2);
1286
- })()
1287
- `,
1288
- returnByValue: true,
1289
- });
1290
-
1291
- return ((result as CDPEvalResult).result?.value as string) || "[]";
1350
+ // Timed out — if we at least saw "interactive", that's usually good enough
1351
+ if (sawInteractive) {
1352
+ await new Promise((r) => setTimeout(r, 300));
1353
+ }
1292
1354
  }
1293
1355
 
1294
1356
  isConnected(): boolean {
@@ -62,6 +62,12 @@ export interface ActionResult {
62
62
  success: boolean;
63
63
  }
64
64
 
65
+ /** Structured result from ref-based interactions (click, type, select). */
66
+ export interface RefActionResult {
67
+ success: boolean;
68
+ message: string;
69
+ }
70
+
65
71
  export interface AutoLaunchResult {
66
72
  success: boolean;
67
73
  action: "already_available" | "launched" | "chrome_not_found" | "launch_failed" | "port_conflict";