surfagent 1.0.9 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/API.md CHANGED
@@ -447,6 +447,27 @@ Check if the API can connect to Chrome.
447
447
 
448
448
  ---
449
449
 
450
+ ### POST /type
451
+
452
+ Raw CDP key typing without clearing the field first. Use this for apps like **Google Sheets**, contenteditable elements, or any context where `/fill`'s Ctrl+A clear step causes side effects (e.g., selecting all cells instead of clearing a field).
453
+
454
+ **Request:**
455
+ ```json
456
+ { "tab": "0", "keys": "Hello World", "submit": "tab" }
457
+ ```
458
+
459
+ - `keys` (string) — characters to type via CDP `Input.dispatchKeyEvent`
460
+ - `submit` (optional) — `"enter"` or `"tab"` to press after typing
461
+
462
+ **Response:**
463
+ ```json
464
+ { "typed": 11, "submitted": true }
465
+ ```
466
+
467
+ **Why not `/fill`?** The `/fill` endpoint focuses an element, does Ctrl+A + Backspace to clear it, then types. In Google Sheets, Ctrl+A selects all cells (not text in the current cell), wiping the entire sheet. `/type` skips the focus and clear — it types into whatever currently has focus.
468
+
469
+ ---
470
+
450
471
  ## Tab Targeting
451
472
 
452
473
  All POST endpoints accept a `tab` field. It resolves in this order:
@@ -548,6 +569,193 @@ POST /read { "tab": "0", "selector": "#main-content" }
548
569
  → Click inside the iframe
549
570
  ```
550
571
 
572
+ ### Google Sheets workflow
573
+
574
+ Google Sheets requires a special approach because `/fill` uses Ctrl+A to clear fields, which selects all cells in Sheets. Use the **name box + `/type`** pattern instead.
575
+
576
+ **Navigate to a cell:** Use `/fill` on the name box (`#t-name-box`) with `"submit": "enter"` so that Enter jumps to the cell.
577
+
578
+ **Type into a cell:** Use `/type` with `"submit": "tab"` (moves to next cell) or `"submit": "enter"` (moves down).
579
+
580
+ ```
581
+ 1. POST /click { "tab": "sheets", "selector": "#t-name-box" }
582
+ → Focus the name box
583
+ 2. POST /fill { "tab": "sheets", "fields": [{ "selector": "#t-name-box", "value": "A1", "clear": true }], "submit": "enter" }
584
+ → Navigate to cell A1
585
+ 3. POST /type { "tab": "sheets", "keys": "Hello World", "submit": "tab" }
586
+ → Type into A1, Tab moves to B1
587
+ 4. POST /type { "tab": "sheets", "keys": "=SUM(A1:A10)", "submit": "enter" }
588
+ → Type a formula, Enter commits and moves down
589
+ ```
590
+
591
+ **Adding a new sheet tab:** The "+" button at the bottom does not respond to DOM `.click()`. Use CDP `Input.dispatchMouseEvent` at the button's coordinates:
592
+
593
+ ```bash
594
+ # Get the Add Sheet button position
595
+ curl -s -X POST localhost:3456/eval -d '{"tab":"0","expression":"var els = document.querySelectorAll(\"div[data-tooltip]\"); var r = \"\"; for(var i=0;i<els.length;i++){if(els[i].dataset.tooltip===\"Add Sheet\"){var b=els[i].getBoundingClientRect(); r=b.x+\",\"+b.y+\",\"+b.width+\",\"+b.height}} r"}'
596
+ # Returns: "44,854,34,34" (x,y,width,height) — click the center: x + width/2 = 61, y + height/2 = 871
597
+
598
+ # Click it with a Node script using CDP mouse events
599
+ node -e "
600
+ const CDP = require('chrome-remote-interface');
601
+ (async () => {
602
+ const targets = await CDP.List({port: 9222});
603
+ const tab = targets.find(t => t.url.includes('docs.google.com'));
604
+ const client = await CDP({target: tab, port: 9222});
605
+ await client.Input.dispatchMouseEvent({type:'mousePressed', x:61, y:871, button:'left', clickCount:1});
606
+ await client.Input.dispatchMouseEvent({type:'mouseReleased', x:61, y:871, button:'left', clickCount:1});
607
+ await client.close();
608
+ })();
609
+ "
610
+ ```
611
+
612
+ **Renaming a sheet tab:** Double-click the tab name via CDP mouse events at the tab's coordinates, then use `/type` to enter the new name and press Enter.
613
+
614
+ **Using the menu search:** Google Sheets has a menu search box (`input[aria-label="Menus"]` or `input[aria-label="Menus (Option+/)"]`). Use `/fill` to type a command (e.g., "Insert chart"), then `/click` on the matching result.
615
+
616
+ **Key gotchas:**
617
+ - Never use `/fill` directly on Google Sheets cells — it will wipe data via Ctrl+A
618
+ - Always navigate to a cell via the name box first, then `/type`
619
+ - Some buttons (Add Sheet, menu items) only respond to CDP mouse events, not DOM clicks
620
+ - Navigating away from unsaved Sheets triggers a native Chrome dialog — see the "Native Chrome Dialogs" section below
621
+
622
+ ### CDP mouse clicks for unreachable elements
623
+
624
+ Some UI elements don't respond to JavaScript `.click()` or the `/click` endpoint — they only react to real mouse events at their coordinates. This is common for:
625
+ - Google Sheets buttons (Add Sheet, toolbar items)
626
+ - Canvas-rendered elements
627
+ - Custom widgets that listen for `mousedown`/`mouseup` events
628
+
629
+ **Pattern:**
630
+ ```bash
631
+ # 1. Get the element's coordinates via /eval
632
+ curl -s -X POST localhost:3456/eval -d '{"tab":"0","expression":"document.querySelector(\"#my-button\").getBoundingClientRect().x"}'
633
+ # → {"result": 100}   (repeat for .y, .width and .height, then click at the element's center: x + width/2, y + height/2)
634
+
635
+ # 2. Click via CDP Input.dispatchMouseEvent (requires a Node script)
636
+ node -e "
637
+ const CDP = require('chrome-remote-interface');
638
+ (async () => {
639
+ const targets = await CDP.List({port: 9222});
640
+ const tab = targets.find(t => t.url.includes('your-site'));
641
+ const client = await CDP({target: tab, port: 9222});
642
+ await client.Input.dispatchMouseEvent({type:'mousePressed', x:100, y:200, button:'left', clickCount:1});
643
+ await client.Input.dispatchMouseEvent({type:'mouseReleased', x:100, y:200, button:'left', clickCount:1});
644
+ await client.close();
645
+ })();
646
+ "
647
+ ```
648
+
649
+ **Double-click** (e.g., to rename a Google Sheets tab): use `clickCount: 2`.
650
+
651
+ ---
652
+
653
+ ## Native Chrome Dialogs (Not in DOM or CDP)
654
+
655
+ Chrome can show browser-level popups — like "Leave page?" (`beforeunload`) dialogs — that are **not in the DOM**, **not accessible via CDP**, and will **block all CDP commands** (`/eval`, `/recon`, `/read` will all hang or time out).
656
+
657
+ **Symptoms of a stuck session:**
658
+ - API calls hang or time out on a tab that was previously working
659
+ - `Page.handleJavaScriptDialog` returns "No dialog is showing" (because it's not a JS dialog — it's a native Chrome window)
660
+ - The agent appears frozen on a page
661
+
662
+ **How to detect it (macOS only):**
663
+
664
+ Use CoreGraphics to list windows belonging to the surfagent Chrome process. Native dialogs appear as small unnamed windows (~260x218px) that are not visible to CDP.
665
+
666
+ ```bash
667
+ # 1. Find the surfagent Chrome PID
668
+ SURFAGENT_PID=$(ps aux | grep 'chrome.*surfagent' | grep -v grep | awk '{print $2}')
669
+
670
+ # 2. List all windows for that PID using CoreGraphics
671
+ swift -e "
672
+ import CoreGraphics
673
+ let windows = CGWindowListCopyWindowInfo(.optionAll, kCGNullWindowID) as! [[String: Any]]
674
+ for w in windows {
675
+ let pid = w[\"kCGWindowOwnerPID\"] as? Int ?? 0
676
+ if pid == ${SURFAGENT_PID} {
677
+ let name = w[\"kCGWindowName\"] as? String ?? \"(unnamed)\"
678
+ let bounds = w[\"kCGWindowBounds\"] as? [String: Any] ?? [:]
679
+ let width = bounds[\"Width\"] as? Int ?? 0
680
+ let height = bounds[\"Height\"] as? Int ?? 0
681
+ if width > 100 && height > 100 {
682
+ print(\"Window: \(name) | Size: \(width)x\(height)\")
683
+ }
684
+ }
685
+ }
686
+ "
687
+ ```
688
+
689
+ **What to look for:** A small unnamed window (typically ~260x218) alongside the main browser window. That's the native dialog.
690
+
691
+ **How to dismiss it:**
692
+
693
+ Native Chrome dialogs cannot be dismissed via CDP or AppleScript's `tell process "Google Chrome"` (which only sees the personal Chrome, not the surfagent debug instance). You must use the **Swift Accessibility API targeting the surfagent PID directly**.
694
+
695
+ ```bash
696
+ # Find the surfagent Chrome PID
697
+ SURFAGENT_PID=$(ps aux | grep 'chrome.*surfagent' | grep -v grep | awk '{print $2}')
698
+
699
+ # Click "Cancel" (stay on page) — to leave instead, set `targetNames = leaveNames` in the script below ("Avbryt" / "Gå ut" are the Norwegian button titles)
700
+ swift -e "
701
+ import Cocoa
702
+
703
+ let pid: pid_t = ${SURFAGENT_PID}
704
+ let app = AXUIElementCreateApplication(pid)
705
+
706
+ var windowsRef: CFTypeRef?
707
+ AXUIElementCopyAttributeValue(app, \"AXWindows\" as CFString, &windowsRef)
708
+
709
+ if let windows = windowsRef as? [AXUIElement] {
710
+ for win in windows {
711
+ var subroleRef: CFTypeRef?
712
+ AXUIElementCopyAttributeValue(win, \"AXSubrole\" as CFString, &subroleRef)
713
+ let subrole = subroleRef as? String ?? \"\"
714
+
715
+ // Native dialogs have subrole AXDialog
716
+ if subrole == \"AXDialog\" {
717
+ var childrenRef: CFTypeRef?
718
+ AXUIElementCopyAttributeValue(win, \"AXChildren\" as CFString, &childrenRef)
719
+ if let children = childrenRef as? [AXUIElement] {
720
+ for child in children {
721
+ var roleRef: CFTypeRef?
722
+ AXUIElementCopyAttributeValue(child, \"AXRole\" as CFString, &roleRef)
723
+ var titleRef: CFTypeRef?
724
+ AXUIElementCopyAttributeValue(child, \"AXTitle\" as CFString, &titleRef)
725
+ let role = roleRef as? String ?? \"\"
726
+ let title = titleRef as? String ?? \"\"
727
+
728
+ // Match button by title — handles multiple languages
729
+ // Cancel/Stay: \"Cancel\", \"Avbryt\" (Norwegian)
730
+ // Leave: \"Leave\", \"Gå ut\" (Norwegian)
731
+ let cancelNames = [\"Cancel\", \"Avbryt\"]
732
+ let leaveNames = [\"Leave\", \"Gå ut\"]
733
+
734
+ let targetNames = cancelNames // Change to leaveNames to leave
735
+
736
+ if role == \"AXButton\" && targetNames.contains(title) {
737
+ let result = AXUIElementPerformAction(child, \"AXPress\" as CFString)
738
+ print(\"Clicked \(title): \(result == .success ? \"SUCCESS\" : \"FAILED\")\")
739
+ }
740
+ }
741
+ }
742
+ }
743
+ }
744
+ }
745
+ "
746
+ ```
747
+
748
+ **Why AppleScript doesn't work:** `tell process "Google Chrome"` sees all Chrome instances as one process, but only exposes the *personal* Chrome's windows. The surfagent Chrome (launched with `--user-data-dir=/tmp/surfagent-chrome`) is invisible to it. The Swift `AXUIElementCreateApplication(pid)` approach targets the exact process by PID, which is the only way to reach the surfagent Chrome's native dialogs.
749
+
750
+ **When to check:** If any API call hangs or times out unexpectedly on a tab that was previously responsive, check for a native dialog before retrying. Common triggers:
751
+ - Navigating away from pages with unsaved changes (Google Sheets, web editors, forms)
752
+ - `window.onbeforeunload` handlers
753
+ - Chrome permission prompts
754
+
755
+ **Decision logic for agents:**
756
+ - **Click "Leave"** if you intentionally navigated away and don't need the page anymore
757
+ - **Click "Cancel"** if the navigation was accidental and you want to keep working on the current page (e.g., Google Sheets with unsaved data)
758
+
551
759
  ---
552
760
 
553
761
  ## Important Notes
package/CLAUDE.md CHANGED
@@ -76,6 +76,9 @@ curl -X POST localhost:3456/eval -H 'Content-Type: application/json' -d '{"tab":
76
76
  # Bring tab to front
77
77
  curl -X POST localhost:3456/focus -H 'Content-Type: application/json' -d '{"tab":"0"}'
78
78
 
79
+ # Raw key typing — no clear step, for Google Sheets / contenteditable / canvas
80
+ curl -X POST localhost:3456/type -H 'Content-Type: application/json' -d '{"tab":"0","keys":"Hello","submit":"tab"}'
81
+
79
82
  # Captcha detection and interaction (experimental)
80
83
  curl -X POST localhost:3456/captcha -H 'Content-Type: application/json' -d '{"tab":"0","action":"detect"}'
81
84
 
@@ -86,6 +89,25 @@ curl localhost:3456/tabs
86
89
  curl localhost:3456/health
87
90
  ```
88
91
 
92
+ ### Google Sheets
93
+
94
+ Google Sheets requires `/type` instead of `/fill` for cell input (because `/fill` does Ctrl+A which selects all cells). Use the name box to navigate, then `/type` to enter data:
95
+
96
+ ```bash
97
+ # 1. Click the name box
98
+ curl -X POST localhost:3456/click -H 'Content-Type: application/json' -d '{"tab":"sheets","selector":"#t-name-box"}'
99
+
100
+ # 2. Navigate to a cell
101
+ curl -X POST localhost:3456/fill -H 'Content-Type: application/json' -d '{"tab":"sheets","fields":[{"selector":"#t-name-box","value":"A1","clear":true}],"submit":"enter"}'
102
+
103
+ # 3. Type into the cell (Tab moves right, Enter moves down)
104
+ curl -X POST localhost:3456/type -H 'Content-Type: application/json' -d '{"tab":"sheets","keys":"=SUM(B2:B10)","submit":"tab"}'
105
+ ```
106
+
107
+ Some Sheets buttons (Add Sheet +, toolbar) only respond to CDP mouse events, not DOM clicks. See `API.md` for the CDP mouse click pattern.
108
+
109
+ **Warning:** Navigating away from unsaved Sheets triggers a native Chrome "Leave page?" dialog that blocks ALL CDP commands. See `API.md` > "Native Chrome Dialogs" for detection and dismissal via Swift AX API.
110
+
89
111
  ### Tab Targeting
90
112
 
91
113
  All endpoints accept a `tab` field:
package/README.md CHANGED
@@ -74,6 +74,7 @@ curl -X POST localhost:3456/read -H 'Content-Type: application/json' \
74
74
  | `/navigate` | POST | Go to URL, back, or forward in the same tab |
75
75
  | `/eval` | POST | Run JavaScript in any tab or cross-origin iframe |
76
76
  | `/captcha` | POST | Detect and interact with captchas — Arkose, reCAPTCHA, hCaptcha (experimental) |
77
+ | `/type` | POST | Raw CDP key typing without clearing — for Google Sheets, contenteditable, canvas apps |
77
78
  | `/focus` | POST | Bring a tab to the front in Chrome |
78
79
  | `/tabs` | GET | List all open Chrome tabs |
79
80
  | `/health` | GET | Check if Chrome and API are connected |
package/dist/api/act.d.ts CHANGED
@@ -91,3 +91,11 @@ export declare function focusTab(tabPattern: string, options: {
91
91
  title: string;
92
92
  url: string;
93
93
  }>;
94
+ export declare function typeKeys(tabPattern: string, keys: string, options: {
95
+ port?: number;
96
+ host?: string;
97
+ submit?: string;
98
+ }): Promise<{
99
+ typed: number;
100
+ submitted?: boolean;
101
+ }>;
package/dist/api/act.js CHANGED
@@ -37,43 +37,80 @@ export async function fillFields(request, options) {
37
37
  const results = [];
38
38
  for (const field of request.fields) {
39
39
  try {
40
- // Focus the element and clear it
41
- await client.Runtime.evaluate({
42
- expression: `
43
- (function() {
44
- const el = document.querySelector(${JSON.stringify(field.selector)});
45
- if (!el) throw new Error('Element not found: ${field.selector}');
46
- el.focus();
47
- el.click();
48
- // Select all existing content so typing replaces it
49
- if (el.select) el.select();
50
- else if (el.setSelectionRange) el.setSelectionRange(0, el.value?.length || 0);
51
- })()
52
- `,
40
+ // Detect element type to choose fill strategy
41
+ const elInfo = await client.Runtime.evaluate({
42
+ expression: `(function() {
43
+ const el = document.querySelector(${JSON.stringify(field.selector)});
44
+ if (!el) return { found: false };
45
+ return {
46
+ found: true,
47
+ tag: el.tagName,
48
+ type: el.type || null,
49
+ contentEditable: el.isContentEditable || false,
50
+ maxLength: el.maxLength >= 0 ? el.maxLength : null
51
+ };
52
+ })()`,
53
53
  returnByValue: true
54
54
  });
55
- // Clear existing value with select-all + delete
56
- await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'a', code: 'KeyA', modifiers: 2 }); // Ctrl+A / Cmd+A
57
- await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'a', code: 'KeyA', modifiers: 2 });
58
- await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Backspace', code: 'Backspace' });
59
- await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Backspace', code: 'Backspace' });
60
- // Type each character via CDP Input.dispatchKeyEvent
61
- for (const char of field.value) {
62
- await cdp.Input.dispatchKeyEvent({
63
- type: 'keyDown',
64
- key: char,
65
- text: char,
55
+ const info = elInfo.result.value;
56
+ if (!info || !info.found) {
57
+ results.push({ selector: field.selector, success: false, error: `Element not found: ${field.selector}` });
58
+ continue;
59
+ }
60
+ const isDateTimeRange = ['date', 'time', 'datetime-local', 'month', 'week', 'range', 'color'].includes(info.type);
61
+ const isContentEditable = info.contentEditable && info.tag !== 'INPUT' && info.tag !== 'TEXTAREA';
62
+ if (isDateTimeRange) {
63
+ // Date/time/range inputs: set value programmatically + dispatch events
64
+ await client.Runtime.evaluate({
65
+ expression: `(function() {
66
+ const el = document.querySelector(${JSON.stringify(field.selector)});
67
+ const nativeSetter = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value').set;
68
+ nativeSetter.call(el, ${JSON.stringify(field.value)});
69
+ el.dispatchEvent(new Event('input', { bubbles: true }));
70
+ el.dispatchEvent(new Event('change', { bubbles: true }));
71
+ })()`,
72
+ returnByValue: true
66
73
  });
67
- await cdp.Input.dispatchKeyEvent({
68
- type: 'keyUp',
69
- key: char,
74
+ }
75
+ else {
76
+ // Focus and clear
77
+ await client.Runtime.evaluate({
78
+ expression: `
79
+ (function() {
80
+ const el = document.querySelector(${JSON.stringify(field.selector)});
81
+ el.focus();
82
+ el.click();
83
+ if (el.select) el.select();
84
+ else if (el.setSelectionRange) el.setSelectionRange(0, el.value?.length || 0);
85
+ })()
86
+ `,
87
+ returnByValue: true
70
88
  });
89
+ // Clear existing value with select-all + delete
90
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'a', code: 'KeyA', modifiers: 2 });
91
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'a', code: 'KeyA', modifiers: 2 });
92
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Backspace', code: 'Backspace' });
93
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Backspace', code: 'Backspace' });
94
+ // Type each character via CDP Input.dispatchKeyEvent
95
+ for (const char of field.value) {
96
+ if (char === '\n') {
97
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Enter', code: 'Enter', text: '\r', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 });
98
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Enter', code: 'Enter', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 });
99
+ }
100
+ else if (char === '\t') {
101
+ await client.Runtime.evaluate({ expression: `document.execCommand('insertText', false, '\\t')` });
102
+ }
103
+ else {
104
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: char, text: char });
105
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: char });
106
+ }
107
+ }
71
108
  }
72
- // Verify the value was set
73
- const verify = await client.Runtime.evaluate({
74
- expression: `document.querySelector(${JSON.stringify(field.selector)})?.value`,
75
- returnByValue: true
76
- });
109
+ // Verify: use value for inputs, textContent for contenteditable
110
+ const verifyExpr = isContentEditable
111
+ ? `document.querySelector(${JSON.stringify(field.selector)})?.textContent?.trim()`
112
+ : `document.querySelector(${JSON.stringify(field.selector)})?.value`;
113
+ const verify = await client.Runtime.evaluate({ expression: verifyExpr, returnByValue: true });
77
114
  const actual = verify.result.value;
78
115
  if (actual === field.value) {
79
116
  results.push({ selector: field.selector, success: true });
@@ -81,6 +118,13 @@ export async function fillFields(request, options) {
81
118
  else if (actual === undefined || actual === null) {
82
119
  results.push({ selector: field.selector, success: false, error: `Element not found or has no value: ${field.selector}` });
83
120
  }
121
+ else if (info.maxLength && actual === field.value.substring(0, info.maxLength)) {
122
+ // Maxlength truncation — fill worked within constraint
123
+ results.push({ selector: field.selector, success: true, error: `Truncated to maxlength=${info.maxLength}` });
124
+ }
125
+ else if (isContentEditable && actual.includes(field.value)) {
126
+ results.push({ selector: field.selector, success: true });
127
+ }
84
128
  else {
85
129
  results.push({ selector: field.selector, success: false, error: `Value mismatch: expected "${field.value}", got "${actual}"` });
86
130
  }
@@ -138,13 +182,25 @@ export async function clickElement(request, options) {
138
182
  }
139
183
  if (!el && text) {
140
184
  const lower = text.toLowerCase();
141
- const all = document.querySelectorAll('a, button, input[type="submit"], [role="button"], [role="option"], [role="menuitem"], [role="listitem"], [role="tab"], [role="link"], li[aria-label], [onclick]');
185
+ const all = document.querySelectorAll('a, button, input[type="submit"], [role="button"], [role="option"], [role="menuitem"], [role="listitem"], [role="tab"], [role="link"], li[aria-label], [onclick], label');
186
+ let bestMatch = null;
187
+ let bestScore = Infinity; // lower is better
142
188
  for (const candidate of all) {
143
189
  const t = (candidate.innerText || candidate.textContent || candidate.value || candidate.getAttribute('aria-label') || '').trim();
144
- if (t.toLowerCase().includes(lower)) { el = candidate; break; }
190
+ const tLower = t.toLowerCase();
191
+ if (!tLower.includes(lower)) continue;
192
+ // Score: 0 = exact, 1 = starts-with, 2+ = contains (shorter text = better)
193
+ let score;
194
+ if (tLower === lower) score = 0;
195
+ else if (tLower.startsWith(lower)) score = 1;
196
+ else score = 2 + t.length;
197
+ if (score < bestScore) { bestMatch = candidate; bestScore = score; }
198
+ if (score === 0) break; // exact match, stop
145
199
  }
200
+ el = bestMatch;
146
201
  }
147
202
  if (!el) return { success: false, error: 'Element not found' };
203
+ if (el.disabled || el.getAttribute('aria-disabled') === 'true') return { success: false, error: 'Element is disabled' };
148
204
 
149
205
  el.scrollIntoView({ block: 'center' });
150
206
 
@@ -192,17 +248,39 @@ export async function scrollPage(request, options) {
192
248
  const scrollY = Math.round(window.scrollY);
193
249
  const scrollHeight = document.documentElement.scrollHeight;
194
250
  const viewportHeight = window.innerHeight;
195
- const atBottom = (scrollY + viewportHeight) >= (scrollHeight - 10);
251
+ const atBottom = (scrollY + viewportHeight) >= (scrollHeight - 2);
196
252
 
197
- // Get visible text content
198
- const centerY = scrollY + viewportHeight / 2;
199
- const elements = document.elementsFromPoint(window.innerWidth / 2, viewportHeight / 2);
253
+ // Get visible text content from elements in the current viewport
200
254
  let contentPreview = '';
201
- for (const el of elements) {
255
+ const visibleTexts = [];
256
+ const mainEl = document.querySelector('main, article, [role="main"]') || document.body;
257
+ const allEls = mainEl.querySelectorAll('p, li, td, th, h1, h2, h3, h4, h5, h6, dd, dt, blockquote, pre');
258
+ for (const el of allEls) {
259
+ if (visibleTexts.length >= 30) break;
260
+ const rect = el.getBoundingClientRect();
261
+ // Element must be within the viewport
262
+ if (rect.bottom < 0 || rect.top > viewportHeight || rect.height === 0) continue;
263
+ // Skip fixed/sticky elements (nav, TOC, sidebars)
264
+ const style = window.getComputedStyle(el.closest('nav, aside, [role="navigation"]') || el);
265
+ if (style.position === 'fixed' || style.position === 'sticky') continue;
202
266
  const text = el.innerText?.trim();
203
- if (text && text.length > 50) {
204
- contentPreview = text.substring(0, 1500);
205
- break;
267
+ if (!text || text.length < 5) continue;
268
+ // Skip if text is too long (likely a parent container)
269
+ if (text.length > 500) continue;
270
+ // Skip duplicates
271
+ if (visibleTexts.some(t => t.includes(text) || text.includes(t))) continue;
272
+ visibleTexts.push(text);
273
+ }
274
+ contentPreview = visibleTexts.join('\\n').substring(0, 1500);
275
+ if (!contentPreview) {
276
+ // Fallback: grab from center point
277
+ const elements = document.elementsFromPoint(window.innerWidth / 2, viewportHeight / 2);
278
+ for (const el of elements) {
279
+ const text = el.innerText?.trim();
280
+ if (text && text.length > 50 && text.length < 3000) {
281
+ contentPreview = text.substring(0, 1500);
282
+ break;
283
+ }
206
284
  }
207
285
  }
208
286
 
@@ -237,8 +315,16 @@ export async function navigatePage(request, options) {
237
315
  await new Promise(resolve => setTimeout(resolve, waitMs));
238
316
  }
239
317
  else if (request.url) {
318
+ // Block dangerous URL schemes
319
+ const scheme = request.url.trim().toLowerCase().split(':')[0];
320
+ if (['javascript', 'vbscript'].includes(scheme)) {
321
+ await client.close();
322
+ throw new Error('Blocked: javascript: URLs are not allowed');
323
+ }
240
324
  await client.Page.navigate({ url: request.url });
241
- await client.Page.loadEventFired();
325
+ // Race loadEventFired against a timeout to prevent hanging on non-loading URLs
326
+ const loadTimeout = new Promise(resolve => setTimeout(resolve, Math.min(waitMs + 10000, 30000)));
327
+ await Promise.race([client.Page.loadEventFired(), loadTimeout]);
242
328
  await new Promise(resolve => setTimeout(resolve, waitMs));
243
329
  }
244
330
  const result = await client.Runtime.evaluate({
@@ -262,9 +348,18 @@ export async function evalInTab(tab, expression, options) {
262
348
  const timeout = new Promise((_, reject) => setTimeout(() => reject(new Error('Eval timed out after 30s')), 30000));
263
349
  const evalPromise = client.Runtime.evaluate({
264
350
  expression,
265
- returnByValue: true
351
+ returnByValue: true,
352
+ awaitPromise: true
266
353
  });
267
354
  const result = await Promise.race([evalPromise, timeout]);
355
+ // Check for exceptions (syntax errors, thrown errors, etc.)
356
+ if (result.exceptionDetails) {
357
+ const desc = result.exceptionDetails.exception?.description
358
+ || result.exceptionDetails.text
359
+ || 'Unknown error';
360
+ await client.close();
361
+ return { __error: desc };
362
+ }
268
363
  await client.close();
269
364
  return result.result.value ?? null;
270
365
  }
@@ -579,3 +674,36 @@ export async function focusTab(tabPattern, options) {
579
674
  throw error;
580
675
  }
581
676
  }
677
+ // Raw CDP key typing — no clear step, no element focus. Types directly into whatever has focus.
678
+ // Designed for apps like Google Sheets where Ctrl+A/Backspace clear causes side effects.
679
+ export async function typeKeys(tabPattern, keys, options) {
680
+ const port = options.port || 9222;
681
+ const host = options.host || 'localhost';
682
+ const tab = await resolveTab(tabPattern, port, host);
683
+ const client = await connectToTab(tab.id, port, host);
684
+ const cdp = client;
685
+ try {
686
+ // Type each character via CDP Input.dispatchKeyEvent
687
+ for (const char of keys) {
688
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: char, text: char });
689
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: char });
690
+ }
691
+ let submitted = false;
692
+ if (options.submit === 'enter') {
693
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Enter', code: 'Enter', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 });
694
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Enter', code: 'Enter', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 });
695
+ submitted = true;
696
+ }
697
+ else if (options.submit === 'tab') {
698
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Tab', code: 'Tab', windowsVirtualKeyCode: 9, nativeVirtualKeyCode: 9 });
699
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Tab', code: 'Tab', windowsVirtualKeyCode: 9, nativeVirtualKeyCode: 9 });
700
+ submitted = true;
701
+ }
702
+ await client.close();
703
+ return { typed: keys.length, submitted };
704
+ }
705
+ catch (error) {
706
+ await client.close();
707
+ throw error;
708
+ }
709
+ }
@@ -28,7 +28,9 @@ export interface ReconResult {
28
28
  role: string | null;
29
29
  x: number;
30
30
  y: number;
31
+ data?: Record<string, string>;
31
32
  }[];
33
+ totalElements: number;
32
34
  forms: {
33
35
  action: string | null;
34
36
  method: string | null;
package/dist/api/recon.js CHANGED
@@ -30,7 +30,14 @@ const EXTRACTION_SCRIPT = `
30
30
  const tag = el.tagName.toLowerCase();
31
31
  if (el.getAttribute('aria-label')) return tag + '[aria-label="' + el.getAttribute('aria-label') + '"]';
32
32
  if (el.getAttribute('data-testid')) return '[data-testid="' + el.getAttribute('data-testid') + '"]';
33
- if (el.getAttribute('name')) return tag + '[name="' + el.getAttribute('name') + '"]';
33
+ if (el.getAttribute('name')) {
34
+ const nameSelector = tag + '[name="' + el.getAttribute('name') + '"]';
35
+ // Disambiguate radio/checkbox with same name by adding value
36
+ if ((el.type === 'radio' || el.type === 'checkbox') && el.value) {
37
+ return nameSelector + '[value="' + el.value + '"]';
38
+ }
39
+ return nameSelector;
40
+ }
34
41
  // Positional fallback
35
42
  const parent = el.parentElement;
36
43
  if (!parent) return tag;
@@ -106,7 +113,12 @@ const EXTRACTION_SCRIPT = `
106
113
  selector: buildSelector(el),
107
114
  role: el.getAttribute('role'),
108
115
  x: Math.round(rect.x),
109
- y: Math.round(rect.y)
116
+ y: Math.round(rect.y),
117
+ ...(el.dataset && Object.keys(el.dataset).length > 0 ? {
118
+ data: Object.fromEntries(
119
+ ['date','iso','value','testid','id'].filter(k => el.dataset[k]).map(k => [k, el.dataset[k]])
120
+ )
121
+ } : {})
110
122
  });
111
123
  }
112
124
  }
@@ -208,6 +220,16 @@ const EXTRACTION_SCRIPT = `
208
220
  }
209
221
  }
210
222
  }
223
+ // Deduplicate: remove overlays that are descendants of other overlays
224
+ const deduped = overlays.filter((o, i) => {
225
+ const el = document.querySelector(o.selector);
226
+ if (!el) return true;
227
+ return !overlays.some((other, j) => {
228
+ if (i === j) return false;
229
+ const otherEl = document.querySelector(other.selector);
230
+ return otherEl && otherEl !== el && otherEl.contains(el);
231
+ });
232
+ });
211
233
 
212
234
  // ---- Captcha detection ----
213
235
  const captchas = [];
@@ -238,9 +260,10 @@ const EXTRACTION_SCRIPT = `
238
260
  headings,
239
261
  navigation: navigation.slice(0, 50),
240
262
  elements: elements.slice(0, 150),
263
+ totalElements: elements.length,
241
264
  forms,
242
265
  landmarks,
243
- overlays,
266
+ overlays: deduped,
244
267
  captchas,
245
268
  contentSummary
246
269
  };
@@ -291,6 +314,7 @@ export async function reconUrl(url, options) {
291
314
  headings: data.headings,
292
315
  navigation: data.navigation,
293
316
  elements: data.elements,
317
+ totalElements: data.totalElements || data.elements?.length || 0,
294
318
  forms: data.forms,
295
319
  contentSummary: data.contentSummary,
296
320
  landmarks: data.landmarks,
@@ -348,6 +372,7 @@ export async function reconTab(tabPattern, options) {
348
372
  headings: data.headings,
349
373
  navigation: data.navigation,
350
374
  elements: data.elements,
375
+ totalElements: data.totalElements || data.elements?.length || 0,
351
376
  forms: data.forms,
352
377
  contentSummary: data.contentSummary,
353
378
  landmarks: data.landmarks,
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
  import http from 'node:http';
3
3
  import { reconUrl, reconTab } from './recon.js';
4
- import { fillFields, clickElement, scrollPage, navigatePage, evalInTab, focusTab, readPage, captchaInteract, dismissOverlays } from './act.js';
4
+ import { fillFields, clickElement, scrollPage, navigatePage, evalInTab, focusTab, readPage, captchaInteract, dismissOverlays, typeKeys } from './act.js';
5
5
  import { getAllTabs } from '../chrome/tabs.js';
6
6
  const PORT = parseInt(process.env.API_PORT || '3456', 10);
7
7
  const CDP_PORT = parseInt(process.env.CDP_PORT || '9222', 10);
@@ -136,14 +136,32 @@ const server = http.createServer(async (req, res) => {
136
136
  return json(res, 400, { error: 'Provide "tab" and "expression"' });
137
137
  }
138
138
  const result = await evalInTab(body.tab, body.expression, { port: CDP_PORT, host: CDP_HOST });
139
+ if (result && result.__error) {
140
+ return json(res, 200, { result: null, error: result.__error });
141
+ }
139
142
  return json(res, 200, { result });
140
143
  }
144
+ // POST /type — raw CDP key typing, no clear step (for Google Sheets, contenteditable, etc.)
145
+ if (path === '/type' && req.method === 'POST') {
146
+ const body = parseBody(await readBody(req));
147
+ if (!body.tab || !body.keys) {
148
+ return json(res, 400, { error: 'Provide "tab" and "keys" (string to type), optional "submit": "enter"|"tab"' });
149
+ }
150
+ const result = await typeKeys(body.tab, body.keys, { port: CDP_PORT, host: CDP_HOST, submit: body.submit });
151
+ return json(res, 200, result);
152
+ }
141
153
  // POST /navigate — go to url, back, or forward in same tab
142
154
  if (path === '/navigate' && req.method === 'POST') {
143
155
  const body = parseBody(await readBody(req));
144
156
  if (!body.tab) {
145
157
  return json(res, 400, { error: 'Provide "tab" and one of: "url", "back":true, "forward":true' });
146
158
  }
159
+ if (!body.url && !body.back && !body.forward) {
160
+ return json(res, 400, { error: 'Provide one of: "url", "back":true, "forward":true' });
161
+ }
162
+ if ((body.url && body.back) || (body.url && body.forward) || (body.back && body.forward)) {
163
+ return json(res, 400, { error: 'Provide only one of: "url", "back", "forward"' });
164
+ }
147
165
  const result = await navigatePage(body, { port: CDP_PORT, host: CDP_HOST });
148
166
  return json(res, 200, result);
149
167
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "surfagent",
3
- "version": "1.0.9",
3
+ "version": "1.1.1",
4
4
  "description": "Browser automation API for AI agents — structured page recon, form filling, clicking, and navigation via Chrome CDP",
5
5
  "keywords": [
6
6
  "ai-agent",
package/src/api/act.ts CHANGED
@@ -60,52 +60,93 @@ export async function fillFields(
60
60
 
61
61
  for (const field of request.fields) {
62
62
  try {
63
- // Focus the element and clear it
64
- await client.Runtime.evaluate({
65
- expression: `
66
- (function() {
67
- const el = document.querySelector(${JSON.stringify(field.selector)});
68
- if (!el) throw new Error('Element not found: ${field.selector}');
69
- el.focus();
70
- el.click();
71
- // Select all existing content so typing replaces it
72
- if (el.select) el.select();
73
- else if (el.setSelectionRange) el.setSelectionRange(0, el.value?.length || 0);
74
- })()
75
- `,
63
+ // Detect element type to choose fill strategy
64
+ const elInfo = await client.Runtime.evaluate({
65
+ expression: `(function() {
66
+ const el = document.querySelector(${JSON.stringify(field.selector)});
67
+ if (!el) return { found: false };
68
+ return {
69
+ found: true,
70
+ tag: el.tagName,
71
+ type: el.type || null,
72
+ contentEditable: el.isContentEditable || false,
73
+ maxLength: el.maxLength >= 0 ? el.maxLength : null
74
+ };
75
+ })()`,
76
76
  returnByValue: true
77
77
  });
78
+ const info = elInfo.result.value as any;
79
+ if (!info || !info.found) {
80
+ results.push({ selector: field.selector, success: false, error: `Element not found: ${field.selector}` });
81
+ continue;
82
+ }
83
+
84
+ const isDateTimeRange = ['date', 'time', 'datetime-local', 'month', 'week', 'range', 'color'].includes(info.type);
85
+ const isContentEditable = info.contentEditable && info.tag !== 'INPUT' && info.tag !== 'TEXTAREA';
78
86
 
79
- // Clear existing value with select-all + delete
80
- await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'a', code: 'KeyA', modifiers: 2 }); // Ctrl+A / Cmd+A
81
- await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'a', code: 'KeyA', modifiers: 2 });
82
- await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Backspace', code: 'Backspace' });
83
- await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Backspace', code: 'Backspace' });
84
-
85
- // Type each character via CDP Input.dispatchKeyEvent
86
- for (const char of field.value) {
87
- await cdp.Input.dispatchKeyEvent({
88
- type: 'keyDown',
89
- key: char,
90
- text: char,
87
+ if (isDateTimeRange) {
88
+ // Date/time/range inputs: set value programmatically + dispatch events
89
+ await client.Runtime.evaluate({
90
+ expression: `(function() {
91
+ const el = document.querySelector(${JSON.stringify(field.selector)});
92
+ const nativeSetter = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, 'value').set;
93
+ nativeSetter.call(el, ${JSON.stringify(field.value)});
94
+ el.dispatchEvent(new Event('input', { bubbles: true }));
95
+ el.dispatchEvent(new Event('change', { bubbles: true }));
96
+ })()`,
97
+ returnByValue: true
91
98
  });
92
- await cdp.Input.dispatchKeyEvent({
93
- type: 'keyUp',
94
- key: char,
99
+ } else {
100
+ // Focus and clear
101
+ await client.Runtime.evaluate({
102
+ expression: `
103
+ (function() {
104
+ const el = document.querySelector(${JSON.stringify(field.selector)});
105
+ el.focus();
106
+ el.click();
107
+ if (el.select) el.select();
108
+ else if (el.setSelectionRange) el.setSelectionRange(0, el.value?.length || 0);
109
+ })()
110
+ `,
111
+ returnByValue: true
95
112
  });
113
+
114
+ // Clear existing value with select-all + delete
115
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'a', code: 'KeyA', modifiers: 2 });
116
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'a', code: 'KeyA', modifiers: 2 });
117
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Backspace', code: 'Backspace' });
118
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Backspace', code: 'Backspace' });
119
+
120
+ // Type each character via CDP Input.dispatchKeyEvent
121
+ for (const char of field.value) {
122
+ if (char === '\n') {
123
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: 'Enter', code: 'Enter', text: '\r', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 });
124
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: 'Enter', code: 'Enter', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 });
125
+ } else if (char === '\t') {
126
+ await client.Runtime.evaluate({ expression: `document.execCommand('insertText', false, '\\t')` });
127
+ } else {
128
+ await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: char, text: char });
129
+ await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: char });
130
+ }
131
+ }
96
132
  }
97
133
 
98
- // Verify the value was set
99
- const verify = await client.Runtime.evaluate({
100
- expression: `document.querySelector(${JSON.stringify(field.selector)})?.value`,
101
- returnByValue: true
102
- });
134
+ // Verify: use value for inputs, textContent for contenteditable
135
+ const verifyExpr = isContentEditable
136
+ ? `document.querySelector(${JSON.stringify(field.selector)})?.textContent?.trim()`
137
+ : `document.querySelector(${JSON.stringify(field.selector)})?.value`;
138
+ const verify = await client.Runtime.evaluate({ expression: verifyExpr, returnByValue: true });
103
139
 
104
140
  const actual = verify.result.value as string;
105
141
  if (actual === field.value) {
106
142
  results.push({ selector: field.selector, success: true });
107
143
  } else if (actual === undefined || actual === null) {
108
144
  results.push({ selector: field.selector, success: false, error: `Element not found or has no value: ${field.selector}` });
145
+ } else if (info.maxLength && actual === field.value.substring(0, info.maxLength)) {
146
+ // Maxlength truncation — fill worked within constraint
147
+ results.push({ selector: field.selector, success: true, error: `Truncated to maxlength=${info.maxLength}` });
148
+ } else if (isContentEditable && actual.includes(field.value)) {
149
+ results.push({ selector: field.selector, success: true });
109
150
  } else {
110
151
  results.push({ selector: field.selector, success: false, error: `Value mismatch: expected "${field.value}", got "${actual}"` });
111
152
  }
@@ -176,13 +217,25 @@ export async function clickElement(
176
217
  }
177
218
  if (!el && text) {
178
219
  const lower = text.toLowerCase();
179
- const all = document.querySelectorAll('a, button, input[type="submit"], [role="button"], [role="option"], [role="menuitem"], [role="listitem"], [role="tab"], [role="link"], li[aria-label], [onclick]');
220
+ const all = document.querySelectorAll('a, button, input[type="submit"], [role="button"], [role="option"], [role="menuitem"], [role="listitem"], [role="tab"], [role="link"], li[aria-label], [onclick], label');
221
+ let bestMatch = null;
222
+ let bestScore = Infinity; // lower is better
180
223
  for (const candidate of all) {
181
224
  const t = (candidate.innerText || candidate.textContent || candidate.value || candidate.getAttribute('aria-label') || '').trim();
182
- if (t.toLowerCase().includes(lower)) { el = candidate; break; }
225
+ const tLower = t.toLowerCase();
226
+ if (!tLower.includes(lower)) continue;
227
+ // Score: 0 = exact, 1 = starts-with, 2+ = contains (shorter text = better)
228
+ let score;
229
+ if (tLower === lower) score = 0;
230
+ else if (tLower.startsWith(lower)) score = 1;
231
+ else score = 2 + t.length;
232
+ if (score < bestScore) { bestMatch = candidate; bestScore = score; }
233
+ if (score === 0) break; // exact match, stop
183
234
  }
235
+ el = bestMatch;
184
236
  }
185
237
  if (!el) return { success: false, error: 'Element not found' };
238
+ if (el.disabled || el.getAttribute('aria-disabled') === 'true') return { success: false, error: 'Element is disabled' };
186
239
 
187
240
  el.scrollIntoView({ block: 'center' });
188
241
 
@@ -243,17 +296,39 @@ export async function scrollPage(
243
296
  const scrollY = Math.round(window.scrollY);
244
297
  const scrollHeight = document.documentElement.scrollHeight;
245
298
  const viewportHeight = window.innerHeight;
246
- const atBottom = (scrollY + viewportHeight) >= (scrollHeight - 10);
299
+ const atBottom = (scrollY + viewportHeight) >= (scrollHeight - 2);
247
300
 
248
- // Get visible text content
249
- const centerY = scrollY + viewportHeight / 2;
250
- const elements = document.elementsFromPoint(window.innerWidth / 2, viewportHeight / 2);
301
+ // Get visible text content from elements in the current viewport
251
302
  let contentPreview = '';
252
- for (const el of elements) {
303
+ const visibleTexts = [];
304
+ const mainEl = document.querySelector('main, article, [role="main"]') || document.body;
305
+ const allEls = mainEl.querySelectorAll('p, li, td, th, h1, h2, h3, h4, h5, h6, dd, dt, blockquote, pre');
306
+ for (const el of allEls) {
307
+ if (visibleTexts.length >= 30) break;
308
+ const rect = el.getBoundingClientRect();
309
+ // Element must be within the viewport
310
+ if (rect.bottom < 0 || rect.top > viewportHeight || rect.height === 0) continue;
311
+ // Skip fixed/sticky elements (nav, TOC, sidebars)
312
+ const style = window.getComputedStyle(el.closest('nav, aside, [role="navigation"]') || el);
313
+ if (style.position === 'fixed' || style.position === 'sticky') continue;
253
314
  const text = el.innerText?.trim();
254
- if (text && text.length > 50) {
255
- contentPreview = text.substring(0, 1500);
256
- break;
315
+ if (!text || text.length < 5) continue;
316
+ // Skip if text is too long (likely a parent container)
317
+ if (text.length > 500) continue;
318
+ // Skip duplicates
319
+ if (visibleTexts.some(t => t.includes(text) || text.includes(t))) continue;
320
+ visibleTexts.push(text);
321
+ }
322
+ contentPreview = visibleTexts.join('\\n').substring(0, 1500);
323
+ if (!contentPreview) {
324
+ // Fallback: grab from center point
325
+ const elements = document.elementsFromPoint(window.innerWidth / 2, viewportHeight / 2);
326
+ for (const el of elements) {
327
+ const text = el.innerText?.trim();
328
+ if (text && text.length > 50 && text.length < 3000) {
329
+ contentPreview = text.substring(0, 1500);
330
+ break;
331
+ }
257
332
  }
258
333
  }
259
334
 
@@ -301,8 +376,16 @@ export async function navigatePage(
301
376
  await client.Runtime.evaluate({ expression: 'window.history.forward()' });
302
377
  await new Promise(resolve => setTimeout(resolve, waitMs));
303
378
  } else if (request.url) {
379
+ // Block dangerous URL schemes
380
+ const scheme = request.url.trim().toLowerCase().split(':')[0];
381
+ if (['javascript', 'vbscript'].includes(scheme)) {
382
+ await client.close();
383
+ throw new Error('Blocked: javascript: URLs are not allowed');
384
+ }
304
385
  await (client.Page as any).navigate({ url: request.url });
305
- await (client.Page as any).loadEventFired();
386
+ // Race loadEventFired against a timeout to prevent hanging on non-loading URLs
387
+ const loadTimeout = new Promise<void>(resolve => setTimeout(resolve, Math.min(waitMs + 10000, 30000)));
388
+ await Promise.race([(client.Page as any).loadEventFired(), loadTimeout]);
306
389
  await new Promise(resolve => setTimeout(resolve, waitMs));
307
390
  }
308
391
 
@@ -336,9 +419,18 @@ export async function evalInTab(
336
419
  );
337
420
  const evalPromise = client.Runtime.evaluate({
338
421
  expression,
339
- returnByValue: true
422
+ returnByValue: true,
423
+ awaitPromise: true
340
424
  });
341
425
  const result = await Promise.race([evalPromise, timeout]);
426
+ // Check for exceptions (syntax errors, thrown errors, etc.)
427
+ if (result.exceptionDetails) {
428
+ const desc = result.exceptionDetails.exception?.description
429
+ || result.exceptionDetails.text
430
+ || 'Unknown error';
431
+ await client.close();
432
+ return { __error: desc };
433
+ }
342
434
  await client.close();
343
435
  return result.result.value ?? null;
344
436
  } catch (error) {
@@ -683,3 +775,43 @@ export async function focusTab(
683
775
  throw error;
684
776
  }
685
777
  }
778
+
779
/**
 * Raw CDP key typing: no element focus and no clear step. Characters are
 * dispatched via `Input.dispatchKeyEvent` into whatever currently has focus in
 * the target tab. Designed for apps like Google Sheets, where the
 * Ctrl+A/Backspace clear performed by `fillFields` causes side effects.
 *
 * @param tabPattern - Tab identifier/pattern, resolved with `resolveTab`.
 * @param keys - Text to type. Iterated by Unicode code point; a `'\n'` is sent
 *   as an Enter key press (same event shape `fillFields` uses for newlines).
 * @param options - `port`/`host` of the CDP endpoint (default 9222 /
 *   'localhost'); `submit` may be `'enter'` or `'tab'` to press that key after
 *   typing. Any other `submit` value is ignored.
 * @returns `typed` — number of characters actually dispatched; `submitted` —
 *   whether a submit key was pressed.
 * @throws Rethrows any error from tab resolution, connection, or dispatch. The
 *   CDP client is always closed once it has been opened.
 */
export async function typeKeys(
  tabPattern: string,
  keys: string,
  options: { port?: number; host?: string; submit?: string }
): Promise<{ typed: number; submitted?: boolean }> {
  const port = options.port || 9222;
  const host = options.host || 'localhost';

  const tab = await resolveTab(tabPattern, port, host);
  const client = await connectToTab(tab.id, port, host);
  const cdp = client as any;

  // Shared key descriptors so keyDown/keyUp pairs cannot drift apart.
  const enterKey = { key: 'Enter', code: 'Enter', windowsVirtualKeyCode: 13, nativeVirtualKeyCode: 13 };
  const tabKey = { key: 'Tab', code: 'Tab', windowsVirtualKeyCode: 9, nativeVirtualKeyCode: 9 };

  try {
    // Count what is actually dispatched: `for...of` walks code points, so
    // `keys.length` (UTF-16 units) would over-report for astral characters.
    let typed = 0;
    for (const char of keys) {
      if (char === '\n') {
        // A newline is a real Enter press, not a printable character.
        await cdp.Input.dispatchKeyEvent({ type: 'keyDown', ...enterKey, text: '\r' });
        await cdp.Input.dispatchKeyEvent({ type: 'keyUp', ...enterKey });
      } else {
        await cdp.Input.dispatchKeyEvent({ type: 'keyDown', key: char, text: char });
        await cdp.Input.dispatchKeyEvent({ type: 'keyUp', key: char });
      }
      typed += 1;
    }

    let submitted = false;
    if (options.submit === 'enter') {
      await cdp.Input.dispatchKeyEvent({ type: 'keyDown', ...enterKey });
      await cdp.Input.dispatchKeyEvent({ type: 'keyUp', ...enterKey });
      submitted = true;
    } else if (options.submit === 'tab') {
      await cdp.Input.dispatchKeyEvent({ type: 'keyDown', ...tabKey });
      await cdp.Input.dispatchKeyEvent({ type: 'keyUp', ...tabKey });
      submitted = true;
    }

    return { typed, submitted };
  } finally {
    // Single close on every exit path (the original closed separately in the
    // success path and in `catch`, which could close twice).
    await client.close();
  }
}
package/src/api/recon.ts CHANGED
@@ -25,7 +25,9 @@ export interface ReconResult {
25
25
  role: string | null;
26
26
  x: number;
27
27
  y: number;
28
+ data?: Record<string, string>;
28
29
  }[];
30
+ totalElements: number;
29
31
  forms: {
30
32
  action: string | null;
31
33
  method: string | null;
@@ -77,7 +79,14 @@ const EXTRACTION_SCRIPT = `
77
79
  const tag = el.tagName.toLowerCase();
78
80
  if (el.getAttribute('aria-label')) return tag + '[aria-label="' + el.getAttribute('aria-label') + '"]';
79
81
  if (el.getAttribute('data-testid')) return '[data-testid="' + el.getAttribute('data-testid') + '"]';
80
- if (el.getAttribute('name')) return tag + '[name="' + el.getAttribute('name') + '"]';
82
+ if (el.getAttribute('name')) {
83
+ const nameSelector = tag + '[name="' + el.getAttribute('name') + '"]';
84
+ // Disambiguate radio/checkbox with same name by adding value
85
+ if ((el.type === 'radio' || el.type === 'checkbox') && el.value) {
86
+ return nameSelector + '[value="' + el.value + '"]';
87
+ }
88
+ return nameSelector;
89
+ }
81
90
  // Positional fallback
82
91
  const parent = el.parentElement;
83
92
  if (!parent) return tag;
@@ -153,7 +162,12 @@ const EXTRACTION_SCRIPT = `
153
162
  selector: buildSelector(el),
154
163
  role: el.getAttribute('role'),
155
164
  x: Math.round(rect.x),
156
- y: Math.round(rect.y)
165
+ y: Math.round(rect.y),
166
+ ...(el.dataset && Object.keys(el.dataset).length > 0 ? {
167
+ data: Object.fromEntries(
168
+ ['date','iso','value','testid','id'].filter(k => el.dataset[k]).map(k => [k, el.dataset[k]])
169
+ )
170
+ } : {})
157
171
  });
158
172
  }
159
173
  }
@@ -255,6 +269,16 @@ const EXTRACTION_SCRIPT = `
255
269
  }
256
270
  }
257
271
  }
272
+ // Deduplicate: remove overlays that are descendants of other overlays
273
+ const deduped = overlays.filter((o, i) => {
274
+ const el = document.querySelector(o.selector);
275
+ if (!el) return true;
276
+ return !overlays.some((other, j) => {
277
+ if (i === j) return false;
278
+ const otherEl = document.querySelector(other.selector);
279
+ return otherEl && otherEl !== el && otherEl.contains(el);
280
+ });
281
+ });
258
282
 
259
283
  // ---- Captcha detection ----
260
284
  const captchas = [];
@@ -285,9 +309,10 @@ const EXTRACTION_SCRIPT = `
285
309
  headings,
286
310
  navigation: navigation.slice(0, 50),
287
311
  elements: elements.slice(0, 150),
312
+ totalElements: elements.length,
288
313
  forms,
289
314
  landmarks,
290
- overlays,
315
+ overlays: deduped,
291
316
  captchas,
292
317
  contentSummary
293
318
  };
@@ -352,6 +377,7 @@ export async function reconUrl(
352
377
  headings: data.headings,
353
378
  navigation: data.navigation,
354
379
  elements: data.elements,
380
+ totalElements: data.totalElements || data.elements?.length || 0,
355
381
  forms: data.forms,
356
382
  contentSummary: data.contentSummary,
357
383
  landmarks: data.landmarks,
@@ -414,6 +440,7 @@ export async function reconTab(
414
440
  headings: data.headings,
415
441
  navigation: data.navigation,
416
442
  elements: data.elements,
443
+ totalElements: data.totalElements || data.elements?.length || 0,
417
444
  forms: data.forms,
418
445
  contentSummary: data.contentSummary,
419
446
  landmarks: data.landmarks,
package/src/api/server.ts CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  import http from 'node:http';
4
4
  import { reconUrl, reconTab } from './recon.js';
5
- import { fillFields, clickElement, scrollPage, navigatePage, evalInTab, focusTab, readPage, captchaInteract, dismissOverlays } from './act.js';
5
+ import { fillFields, clickElement, scrollPage, navigatePage, evalInTab, focusTab, readPage, captchaInteract, dismissOverlays, typeKeys } from './act.js';
6
6
  import { getAllTabs } from '../chrome/tabs.js';
7
7
 
8
8
  const PORT = parseInt(process.env.API_PORT || '3456', 10);
@@ -160,15 +160,34 @@ const server = http.createServer(async (req, res) => {
160
160
  return json(res, 400, { error: 'Provide "tab" and "expression"' });
161
161
  }
162
162
  const result = await evalInTab(body.tab, body.expression, { port: CDP_PORT, host: CDP_HOST });
163
+ if (result && result.__error) {
164
+ return json(res, 200, { result: null, error: result.__error });
165
+ }
163
166
  return json(res, 200, { result });
164
167
  }
165
168
 
169
+ // POST /type — raw CDP key typing, no clear step (for Google Sheets, contenteditable, etc.)
170
+ if (path === '/type' && req.method === 'POST') {
171
+ const body = parseBody(await readBody(req));
172
+ if (!body.tab || !body.keys) {
173
+ return json(res, 400, { error: 'Provide "tab" and "keys" (string to type), optional "submit": "enter"|"tab"' });
174
+ }
175
+ const result = await typeKeys(body.tab, body.keys, { port: CDP_PORT, host: CDP_HOST, submit: body.submit });
176
+ return json(res, 200, result);
177
+ }
178
+
166
179
  // POST /navigate — go to url, back, or forward in same tab
167
180
  if (path === '/navigate' && req.method === 'POST') {
168
181
  const body = parseBody(await readBody(req));
169
182
  if (!body.tab) {
170
183
  return json(res, 400, { error: 'Provide "tab" and one of: "url", "back":true, "forward":true' });
171
184
  }
185
+ if (!body.url && !body.back && !body.forward) {
186
+ return json(res, 400, { error: 'Provide one of: "url", "back":true, "forward":true' });
187
+ }
188
+ if ((body.url && body.back) || (body.url && body.forward) || (body.back && body.forward)) {
189
+ return json(res, 400, { error: 'Provide only one of: "url", "back", "forward"' });
190
+ }
172
191
  const result = await navigatePage(body, { port: CDP_PORT, host: CDP_HOST });
173
192
  return json(res, 200, result);
174
193
  }