athena-browser-mcp 2.0.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/README.md +83 -204
  2. package/dist/src/browser/page-network-tracker.d.ts +88 -0
  3. package/dist/src/browser/page-network-tracker.d.ts.map +1 -0
  4. package/dist/src/browser/page-network-tracker.js +271 -0
  5. package/dist/src/browser/page-network-tracker.js.map +1 -0
  6. package/dist/src/browser/page-stabilization.d.ts +35 -0
  7. package/dist/src/browser/page-stabilization.d.ts.map +1 -0
  8. package/dist/src/browser/page-stabilization.js +42 -0
  9. package/dist/src/browser/page-stabilization.js.map +1 -0
  10. package/dist/src/browser/session-manager.d.ts +4 -0
  11. package/dist/src/browser/session-manager.d.ts.map +1 -1
  12. package/dist/src/browser/session-manager.js +34 -0
  13. package/dist/src/browser/session-manager.js.map +1 -1
  14. package/dist/src/observation/eid-linker.d.ts +84 -0
  15. package/dist/src/observation/eid-linker.d.ts.map +1 -0
  16. package/dist/src/observation/eid-linker.js +268 -0
  17. package/dist/src/observation/eid-linker.js.map +1 -0
  18. package/dist/src/observation/index.d.ts +12 -0
  19. package/dist/src/observation/index.d.ts.map +1 -0
  20. package/dist/src/observation/index.js +15 -0
  21. package/dist/src/observation/index.js.map +1 -0
  22. package/dist/src/observation/observation-accumulator.d.ts +58 -0
  23. package/dist/src/observation/observation-accumulator.d.ts.map +1 -0
  24. package/dist/src/observation/observation-accumulator.js +213 -0
  25. package/dist/src/observation/observation-accumulator.js.map +1 -0
  26. package/dist/src/observation/observation.types.d.ts +108 -0
  27. package/dist/src/observation/observation.types.d.ts.map +1 -0
  28. package/dist/src/observation/observation.types.js +44 -0
  29. package/dist/src/observation/observation.types.js.map +1 -0
  30. package/dist/src/observation/observer-script.d.ts +19 -0
  31. package/dist/src/observation/observer-script.d.ts.map +1 -0
  32. package/dist/src/observation/observer-script.js +519 -0
  33. package/dist/src/observation/observer-script.js.map +1 -0
  34. package/dist/src/snapshot/snapshot.types.d.ts +6 -0
  35. package/dist/src/snapshot/snapshot.types.d.ts.map +1 -1
  36. package/dist/src/snapshot/snapshot.types.js.map +1 -1
  37. package/dist/src/state/diff-engine.d.ts.map +1 -1
  38. package/dist/src/state/diff-engine.js +129 -1
  39. package/dist/src/state/diff-engine.js.map +1 -1
  40. package/dist/src/state/state-manager.d.ts.map +1 -1
  41. package/dist/src/state/state-manager.js +9 -0
  42. package/dist/src/state/state-manager.js.map +1 -1
  43. package/dist/src/state/state-renderer.d.ts +13 -0
  44. package/dist/src/state/state-renderer.d.ts.map +1 -1
  45. package/dist/src/state/state-renderer.js +172 -2
  46. package/dist/src/state/state-renderer.js.map +1 -1
  47. package/dist/src/state/types.d.ts +37 -0
  48. package/dist/src/state/types.d.ts.map +1 -1
  49. package/dist/src/tools/browser-tools.d.ts.map +1 -1
  50. package/dist/src/tools/browser-tools.js +15 -1
  51. package/dist/src/tools/browser-tools.js.map +1 -1
  52. package/dist/src/tools/execute-action.d.ts +22 -6
  53. package/dist/src/tools/execute-action.d.ts.map +1 -1
  54. package/dist/src/tools/execute-action.js +80 -21
  55. package/dist/src/tools/execute-action.js.map +1 -1
  56. package/dist/src/tools/tool-schemas.d.ts +68 -68
  57. package/package.json +2 -2
package/README.md CHANGED
@@ -1,263 +1,142 @@
1
1
  # Athena Browser MCP
2
2
 
3
- [![CI](https://github.com/lespaceman/athena-browser-mcp/actions/workflows/ci.yml/badge.svg)](https://github.com/lespaceman/athena-browser-mcp/actions/workflows/ci.yml)
4
- [![npm version](https://badge.fury.io/js/athena-browser-mcp.svg)](https://www.npmjs.com/package/athena-browser-mcp)
5
- [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3
+ An MCP server for browser automation that exposes semantic, token-efficient page representations optimized for LLM agents.
6
4
 
7
- MCP server for AI browser automation - 18 tools with semantic element targeting.
5
+ ---
8
6
 
9
- ## Design Philosophy
7
+ ## Motivation
10
8
 
11
- 1. **Semantic element IDs** - Stable `eid` references survive DOM mutations
12
- 2. **XML state responses** - Structured page state with layers, actionables, and diffs
13
- 3. **Multi-frame support** - Extract content from iframes (cookie consent, widgets)
14
- 4. **Automatic retry** - Stale element recovery with fresh snapshot
9
+ LLM-based agents operate under strict context window and token constraints.
10
+ However, most browser automation tools expose entire DOMs or full accessibility trees to the model.
15
11
 
16
- ## Architecture
12
+ This leads to:
17
13
 
18
- ```
19
- ┌─────────────────────────────────────────────────────────────────┐
20
- │ AI Agent │
21
- │ ┌────────────────────────────────────────────────────────────┐ │
22
- │ │ System Prompt: XML state (layers, actionables, atoms) │ │
23
- │ └────────────────────────────────────────────────────────────┘ │
24
- └───────────────────────────┬─────────────────────────────────────┘
25
- │ MCP Protocol (stdio)
26
- ┌───────────────────────────▼─────────────────────────────────────┐
27
- │ SESSION: launch_browser, connect_browser, close_page, │
28
- │ close_session │
29
- │ NAVIGATION: navigate, go_back, go_forward, reload │
30
- │ OBSERVATION: capture_snapshot, find_elements, get_node_details │
31
- │ INTERACTION: click, type, press, select, hover, │
32
- │ scroll_element_into_view, scroll_page │
33
- └───────────────────────────┬─────────────────────────────────────┘
34
- │ Playwright + CDP
35
- ┌───────────────────────────▼─────────────────────────────────────┐
36
- │ Chromium Browser │
37
- └─────────────────────────────────────────────────────────────────┘
38
- ```
39
-
40
- ## Tools
41
-
42
- ### Session
43
-
44
- | Tool | Purpose | Input |
45
- | ----------------- | ------------------------- | ------------------- |
46
- | `launch_browser` | Launch new browser | `{ headless? }` |
47
- | `connect_browser` | Connect to existing (CDP) | `{ endpoint_url? }` |
48
- | `close_page` | Close specific page | `{ page_id }` |
49
- | `close_session` | Close entire browser | `{}` |
14
+ - Rapid token exhaustion
15
+ - Higher inference costs
16
+ - Reduced reliability as relevant signal is buried in noise
50
17
 
51
- ### Navigation
18
+ In practice, agents spend more effort _finding_ the right information than reasoning about it.
52
19
 
53
- | Tool | Purpose | Input |
54
- | ------------ | --------------- | ------------------- |
55
- | `navigate` | Go to URL | `{ url, page_id? }` |
56
- | `go_back` | Browser back | `{ page_id? }` |
57
- | `go_forward` | Browser forward | `{ page_id? }` |
58
- | `reload` | Refresh page | `{ page_id? }` |
20
+ Athena exists to change the unit of information exposed to the model.
59
21
 
60
- ### Observation
22
+ ---
61
23
 
62
- | Tool | Purpose | Input |
63
- | ------------------ | ------------------- | ----------------------------------------------------------------- |
64
- | `capture_snapshot` | Capture page state | `{ page_id? }` |
65
- | `find_elements` | Find by criteria | `{ kind?, label?, region?, limit?, include_readable?, page_id? }` |
66
- | `get_node_details` | Get element details | `{ eid, page_id? }` |
24
+ ## Core Idea: Semantic Page Snapshots
67
25
 
68
- ### Interaction
26
+ Instead of exposing raw DOM structures or full accessibility trees, Athena produces **semantic page snapshots**.
69
27
 
70
- | Tool | Purpose | Input |
71
- | -------------------------- | ------------------ | ---------------------------------- |
72
- | `click` | Click element | `{ eid, page_id? }` |
73
- | `type` | Type text | `{ eid, text, clear?, page_id? }` |
74
- | `press` | Press keyboard key | `{ key, modifiers?, page_id? }` |
75
- | `select` | Select option | `{ eid, value, page_id? }` |
76
- | `hover` | Hover element | `{ eid, page_id? }` |
77
- | `scroll_element_into_view` | Scroll to element | `{ eid, page_id? }` |
78
- | `scroll_page` | Scroll viewport | `{ direction, amount?, page_id? }` |
28
+ These snapshots are:
79
29
 
80
- ## Element IDs (eid)
30
+ - Compact and structured
31
+ - Focused on user-visible intent
32
+ - Designed for LLM recall and reasoning, not DOM completeness
33
+ - Stable across layout shifts and DOM churn
81
34
 
82
- Elements are identified by stable semantic IDs (`eid`) instead of transient DOM node IDs:
35
+ The goal is not to mirror the browser, but to present the page in a form that aligns with how language models reason about interfaces.
83
36
 
84
- ```xml
85
- <match eid="a1b2c3d4e5f6" kind="button" label="Sign In" region="header" />
86
- ```
37
+ ---
87
38
 
88
- EIDs are computed from:
89
-
90
- - Role/kind (button, link, input)
91
- - Accessible name (label text)
92
- - Landmark path (region + group hierarchy)
93
- - Position hint (screen zone, quadrant)
94
-
95
- This means the same logical element keeps its `eid` across page updates.
96
-
97
- ## Response Format
98
-
99
- Tools return XML state responses with page understanding:
100
-
101
- ```xml
102
- <state page_id="abc123" url="https://example.com" title="Example">
103
- <layer type="main" active="true">
104
- <actionables count="12">
105
- <el eid="a1b2c3" kind="button" label="Sign In" />
106
- <el eid="d4e5f6" kind="link" label="Forgot password?" />
107
- <el eid="g7h8i9" kind="input" label="Email" type="email" />
108
- </actionables>
109
- </layer>
110
- <atoms>
111
- <viewport w="1280" h="720" />
112
- <scroll x="0" y="0" />
113
- </atoms>
114
- </state>
115
- ```
39
+ ## How It Works
116
40
 
117
- ### Layer Types
41
+ At a high level:
118
42
 
119
- | Layer | Description |
120
- | --------- | -------------------------- |
121
- | `main` | Primary page content |
122
- | `modal` | Dialog overlays |
123
- | `drawer` | Slide-in panels |
124
- | `popover` | Dropdowns, tooltips, menus |
43
+ 1. The browser is controlled via Playwright and CDP
44
+ 2. The page is reduced into semantic regions and actionable elements
45
+ 3. A structured snapshot is generated and sent to the LLM
46
+ 4. Actions are resolved against stable semantic identifiers rather than fragile selectors
125
47
 
126
- ## Usage Examples
48
+ This separation keeps:
127
49
 
128
- ### Login Flow
50
+ - Browser lifecycle management isolated
51
+ - Snapshots deterministic and low-entropy
52
+ - Agent reasoning predictable and efficient
129
53
 
130
- ```
131
- 1. launch_browser { }
132
- → XML state with initial page
54
+ ---
133
55
 
134
- 2. navigate { url: "https://example.com/login" }
135
- → State shows login form elements
56
+ ## Benchmarks
136
57
 
137
- 3. find_elements { kind: "input", label: "email" }
138
- → <match eid="abc123" kind="input" label="Email" />
58
+ Early benchmarks against Playwright MCP show:
139
59
 
140
- 4. click { eid: "abc123" }
141
- → Element focused
60
+ - **~19% fewer tokens consumed**
61
+ - **~33% faster task completion**
62
+ - Same or better success rates on common navigation tasks
142
63
 
143
- 5. type { eid: "abc123", text: "user@example.com" }
144
- → Value filled
64
+ Benchmarks were run using Claude Code on representative real-world tasks.
65
+ Results are task-dependent and should be treated as directional rather than absolute.
145
66
 
146
- 6. press { key: "Tab" }
147
- → Focus moved to password field
67
+ ---
148
68
 
149
- 7. type { eid: "def456", text: "password123" }
150
- → Password filled
151
-
152
- 8. press { key: "Enter" }
153
- → Form submitted, navigation to dashboard
154
- ```
69
+ ## What Athena Is (and Is Not)
155
70
 
156
- ### Cookie Consent (Multi-Frame)
157
-
158
- ```
159
- 1. navigate { url: "https://news-site.com" }
160
- → Modal layer detected (cookie consent iframe)
161
-
162
- 2. find_elements { label: "Accept", kind: "button" }
163
- → <match eid="xyz789" kind="button" label="Accept All" />
164
-
165
- 3. click { eid: "xyz789" }
166
- → Modal closed, main layer active
167
- ```
168
-
169
- ## Installation
170
-
171
- ```bash
172
- npm install
173
- npm run build
174
- ```
71
+ ### Athena is:
175
72
 
176
- ## Configuration
73
+ - A semantic interface between browsers and LLM agents
74
+ - An MCP server focused on reliability and efficiency
75
+ - Designed for agent workflows, not test automation
177
76
 
178
- ### Claude Desktop
77
+ ### Athena is not:
179
78
 
180
- Add to your Claude Desktop config:
79
+ - A general-purpose browser
80
+ - A visual testing or screenshot framework
81
+ - A replacement for Playwright
181
82
 
182
- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
183
- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
184
- **Linux**: `~/.config/Claude/claude_desktop_config.json`
83
+ Playwright remains the execution layer; Athena focuses on representation and reasoning.
185
84
 
186
- ```json
187
- {
188
- "mcpServers": {
189
- "browser": {
190
- "command": "npx",
191
- "args": ["athena-browser-mcp@latest"]
192
- }
193
- }
194
- }
195
- ```
85
+ ---
196
86
 
197
- ### Claude Code
87
+ ## Usage
198
88
 
199
- ```bash
200
- claude mcp add athena-browser-mcp npx athena-browser-mcp@latest
201
- ```
89
+ Athena implements the **Model Context Protocol (MCP)** and works with:
202
90
 
203
- ### VS Code
91
+ - Claude Code
92
+ - Claude Desktop
93
+ - Cursor
94
+ - VS Code
95
+ - Any MCP-compatible client
204
96
 
205
- ```bash
206
- code --add-mcp '{"name":"athena-browser-mcp","command":"npx","args":["athena-browser-mcp@latest"]}'
207
- ```
97
+ Example workflows include:
208
98
 
209
- ### Cursor
99
+ - Navigating complex web apps
100
+ - Handling login and consent flows
101
+ - Performing multi-step UI interactions with lower token usage
210
102
 
211
- Go to **Cursor Settings → MCP → Add new MCP Server**. Use command type with:
103
+ See the `examples/` directory for concrete agent workflows.
212
104
 
213
- ```
214
- npx athena-browser-mcp@latest
215
- ```
105
+ ---
216
106
 
217
- ### Codex
107
+ ## Installation
218
108
 
219
109
  ```bash
220
- codex mcp add athena-browser-mcp npx athena-browser-mcp@latest
110
+ git clone https://github.com/lespaceman/athena-browser-mcp
111
+ cd athena-browser-mcp
112
+ npm install
113
+ npm run build
221
114
  ```
222
115
 
223
- ### Gemini CLI
116
+ Configure the MCP server in your client according to its MCP integration instructions.
224
117
 
225
- ```bash
226
- gemini mcp add -s user athena-browser-mcp -- npx athena-browser-mcp@latest
227
- ```
118
+ ---
228
119
 
229
- ### Connect to Existing Browser
120
+ ## Architecture Overview
230
121
 
231
- To connect to an existing Chromium browser with CDP enabled:
122
+ Athena separates concerns into three layers:
232
123
 
233
- ```bash
234
- # Start Chrome with remote debugging
235
- google-chrome --remote-debugging-port=9222
124
+ - **Browser lifecycle** — page creation, navigation, teardown
125
+ - **Semantic snapshot generation** — regions, elements, identifiers
126
+ - **Action resolution** — mapping agent intent to browser actions
236
127
 
237
- # Or use environment variables
238
- export CEF_BRIDGE_HOST=127.0.0.1
239
- export CEF_BRIDGE_PORT=9222
240
- ```
128
+ This separation allows each layer to evolve independently while keeping agent-visible behavior stable.
241
129
 
242
- Then use `connect_browser` instead of `launch_browser`.
130
+ ---
243
131
 
244
- ### Environment Variables
132
+ ## Status
245
133
 
246
- | Variable | Description | Default |
247
- | ----------------- | -------------------- | ----------- |
248
- | `CEF_BRIDGE_HOST` | CDP host for connect | `127.0.0.1` |
249
- | `CEF_BRIDGE_PORT` | CDP port for connect | `9223` |
134
+ Athena is under active development.
135
+ APIs and snapshot formats may evolve as real-world agent usage informs the design.
250
136
 
251
- ## Development
137
+ Feedback from practitioners building agent systems is especially welcome.
252
138
 
253
- ```bash
254
- npm run build # Compile TypeScript
255
- npm run type-check # TypeScript type checking
256
- npm run lint # ESLint
257
- npm run format # Prettier format
258
- npm run check # Run all checks
259
- npm test # Run tests
260
- ```
139
+ ---
261
140
 
262
141
  ## License
263
142
 
@@ -0,0 +1,88 @@
1
+ /**
2
+ * Page Network Tracker
3
+ *
4
+ * Tracks in-flight network requests for a page and provides a reliable
5
+ * "network quiet" wait mechanism. Unlike Playwright's waitForLoadState('networkidle'),
6
+ * this tracks requests triggered after page load (e.g., by user actions).
7
+ *
8
+ * Uses a generation counter to safely handle navigation - late events from
9
+ * previous documents are ignored.
10
+ */
11
+ import type { Page } from 'playwright';
12
+ /**
13
+ * Tracks network requests for a single page.
14
+ *
15
+ * Attach to a page via `attach()`, then use `waitForQuiet()` to wait for
16
+ * network activity to settle. Call `markNavigation()` when navigating to
17
+ * safely reset state without race conditions.
18
+ */
19
+ export declare class PageNetworkTracker {
20
+ private page;
21
+ private inflightCount;
22
+ private generation;
23
+ private currentGeneration;
24
+ private quietTimer;
25
+ private quietWindowMs;
26
+ private quietResolvers;
27
+ private onRequest;
28
+ private onRequestFinished;
29
+ private onRequestFailed;
30
+ /**
31
+ * Attach network event listeners to a page.
32
+ *
33
+ * Must be called before `waitForQuiet()` can be used.
34
+ * Safe to call multiple times - will detach previous listeners first.
35
+ */
36
+ attach(page: Page): void;
37
+ /**
38
+ * Detach all event listeners and cleanup timers.
39
+ *
40
+ * Call this when the page is closed or no longer needs tracking.
41
+ */
42
+ detach(): void;
43
+ /**
44
+ * Mark that a navigation occurred.
45
+ *
46
+ * This safely resets state by bumping the generation counter, so any
47
+ * late events from the previous document are ignored. Use this instead
48
+ * of directly resetting state to avoid race conditions.
49
+ */
50
+ markNavigation(): void;
51
+ /**
52
+ * Wait for network to become quiet (no inflight requests for quietWindowMs).
53
+ *
54
+ * @param timeoutMs - Maximum time to wait before returning false
55
+ * @param quietWindowMs - Time with 0 inflight requests to consider "idle"
56
+ * @returns true if network became quiet, false if timed out (never throws)
57
+ */
58
+ waitForQuiet(timeoutMs: number, quietWindowMs?: number): Promise<boolean>;
59
+ /**
60
+ * Get current inflight request count (for debugging/testing).
61
+ */
62
+ getInflightCount(): number;
63
+ /**
64
+ * Check if tracker is attached to a page.
65
+ */
66
+ isAttached(): boolean;
67
+ private cancelQuietTimer;
68
+ private checkQuiet;
69
+ private startQuietTimer;
70
+ }
71
+ /**
72
+ * Get or create a network tracker for a page.
73
+ *
74
+ * Note: This does NOT automatically attach the tracker.
75
+ * Call `tracker.attach(page)` after getting the tracker.
76
+ */
77
+ export declare function getOrCreateTracker(page: Page): PageNetworkTracker;
78
+ /**
79
+ * Remove and detach the tracker for a page.
80
+ *
81
+ * Call this when a page is closed to ensure proper cleanup.
82
+ */
83
+ export declare function removeTracker(page: Page): void;
84
+ /**
85
+ * Check if a page has a tracker attached.
86
+ */
87
+ export declare function hasTracker(page: Page): boolean;
88
+ //# sourceMappingURL=page-network-tracker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"page-network-tracker.d.ts","sourceRoot":"","sources":["../../../src/browser/page-network-tracker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,KAAK,EAAE,IAAI,EAAW,MAAM,YAAY,CAAC;AAKhD;;;;;;GAMG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,IAAI,CAAqB;IACjC,OAAO,CAAC,aAAa,CAAK;IAC1B,OAAO,CAAC,UAAU,CAAK;IACvB,OAAO,CAAC,iBAAiB,CAAK;IAG9B,OAAO,CAAC,UAAU,CAA+B;IACjD,OAAO,CAAC,aAAa,CAAmC;IACxD,OAAO,CAAC,cAAc,CAAyE;IAG/F,OAAO,CAAC,SAAS,CAAyC;IAC1D,OAAO,CAAC,iBAAiB,CAAyC;IAClE,OAAO,CAAC,eAAe,CAAyC;IAEhE;;;;;OAKG;IACH,MAAM,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI;IA6CxB;;;;OAIG;IACH,MAAM,IAAI,IAAI;IA6Bd;;;;;;OAMG;IACH,cAAc,IAAI,IAAI;IAkDtB;;;;;;OAMG;IACG,YAAY,CAChB,SAAS,EAAE,MAAM,EACjB,aAAa,GAAE,MAAgC,GAC9C,OAAO,CAAC,OAAO,CAAC;IAuBnB;;OAEG;IACH,gBAAgB,IAAI,MAAM;IAI1B;;OAEG;IACH,UAAU,IAAI,OAAO;IAMrB,OAAO,CAAC,gBAAgB;IAOxB,OAAO,CAAC,UAAU;IAMlB,OAAO,CAAC,eAAe;CAexB;AAYD;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,IAAI,GAAG,kBAAkB,CAOjE;AAED;;;;GAIG;AACH,wBAAgB,aAAa,CAAC,IAAI,EAAE,IAAI,GAAG,IAAI,CAM9C;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,IAAI,EAAE,IAAI,GAAG,OAAO,CAE9C"}