@porcupine/kuskus 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(npm install 2>&1)",
5
+ "Bash(node -e \"import\\('@modelcontextprotocol/sdk/server/stdio.js'\\).then\\(m => console.log\\(Object.keys\\(m\\)\\)\\).catch\\(e => console.error\\(e.message\\)\\)\" && node -e \"import\\('zod'\\).then\\(m => console.log\\('zod ok'\\)\\).catch\\(e => console.error\\(e.message\\)\\)\")",
6
+ "Bash(npx vitest run --reporter=verbose 2>&1)"
7
+ ]
8
+ }
9
+ }
package/.env.example ADDED
@@ -0,0 +1,19 @@
1
+ # ── CLI only ──────────────────────────────────────────────────────────────────
2
+ # Required only for `kuskus run/repl/script` commands.
3
+ # The MCP server does NOT use an API key — the host model drives the agent.
4
+ ANTHROPIC_API_KEY=sk-ant-...
5
+ AGENT_MODEL=claude-sonnet-4-6
6
+ AGENT_MAX_STEPS=20
7
+ AGENT_MAX_TOKENS=4096
8
+ AGENT_INCLUDE_SCREENSHOT=true
9
+ AGENT_SCREENSHOT_QUALITY=80
10
+
11
+ # ── Browser (CLI + MCP) ────────────────────────────────────────────────────────
12
+ CDP_URL=ws://localhost:9222
13
+ CDP_LAUNCH_BROWSER=false
14
+ CDP_BROWSER_PATH=lightpanda
15
+ CDP_BROWSER_PORT=9222
16
+
17
+ # ── Logging ───────────────────────────────────────────────────────────────────
18
+ LOG_LEVEL=info
19
+ LOG_FORMAT=pretty
package/SPEC.md ADDED
@@ -0,0 +1,510 @@
1
+ # CDP Browser Agent — Project Specification
2
+
3
+ ## Overview
4
+
5
+ A Node.js-based AI agent that controls a browser via the Chrome DevTools Protocol (CDP),
6
+ similar to browser-use but without Playwright. The agent receives natural language tasks,
7
+ translates them into CDP commands, and executes them against a running Lightpanda browser.
8
+
9
+ Ships as two artifacts:
10
+ - **CLI** — interactive / scripted terminal usage
11
+ - **MCP Server** — Model Context Protocol server for integration with Claude Desktop, Cursor, etc.
12
+
13
+ ---
14
+
15
+ ## Browser Runtime
16
+
17
+ **Lightpanda** (`github.com/lightpanda-io/browser`)
18
+ - Headless browser with native CDP support
19
+ - Chromium-compatible DevTools Protocol endpoint
20
+ - Starts with: `lightpanda serve --host 127.0.0.1 --port 9222`
21
+ - CDP WebSocket URL: `ws://localhost:9222`; targets are enumerated over HTTP (`http://localhost:9222/json/list`), then connected to per-target over WebSocket
22
+
23
+ ---
24
+
25
+ ## Architecture
26
+
27
+ ```
28
+ ┌─────────────────────────────────────────────────┐
29
+ │ Entry Points │
30
+ │ ┌──────────────────┐ ┌────────────────────┐ │
31
+ │ │ CLI │ │ MCP Server │ │
32
+ │ │ (bin/cli.js) │ │ (bin/mcp.js) │ │
33
+ │ └────────┬─────────┘ └────────┬───────────┘ │
34
+ └───────────┼─────────────────────┼───────────────┘
35
+ │ │
36
+ └──────────┬──────────┘
37
+
38
+ ┌────────────────────────┐
39
+ │ Agent Core │
40
+ │ (src/agent/index.js) │
41
+ │ - Task planner │
42
+ │ - Step executor │
43
+ │ - Memory / context │
44
+ └────────────┬───────────┘
45
+
46
+ ┌────────────▼───────────┐
47
+ │ Browser Client │
48
+ │ (src/cdp/client.js) │
49
+ │ - CDP WebSocket conn │
50
+ │ - Domain managers │
51
+ └────────────┬───────────┘
52
+
53
+ ┌────────────▼───────────┐
54
+ │ Lightpanda Browser │
55
+ │ CDP ws://localhost:9222│
56
+ └────────────────────────┘
57
+ ```
58
+
59
+ ---
60
+
61
+ ## Directory Structure
62
+
63
+ ```
64
+ cdp-browser-agent/
65
+ ├── package.json
66
+ ├── .env.example
67
+ ├── bin/
68
+ │ ├── cli.js # CLI entrypoint
69
+ │ └── mcp.js # MCP server entrypoint
70
+ ├── src/
71
+ │ ├── cdp/
72
+ │ │ ├── client.js # Low-level CDP WebSocket client
73
+ │ │ ├── session.js # Session / target management
74
+ │ │ └── domains/
75
+ │ │ ├── page.js # Page domain (navigate, screenshot, reload)
76
+ │ │ ├── dom.js # DOM domain (query, describe, highlight)
77
+ │ │ ├── input.js # Input domain (mouse, keyboard, touch)
78
+ │ │ ├── runtime.js # Runtime domain (evaluate JS)
79
+ │ │ ├── network.js # Network domain (intercept, monitor)
80
+ │ │ └── target.js # Target domain (tabs management)
81
+ │ ├── agent/
82
+ │ │ ├── index.js # Agent orchestrator
83
+ │ │ ├── planner.js # LLM-based task planner
84
+ │ │ ├── executor.js # Tool call executor
85
+ │ │ ├── tools.js # Tool definitions (schema + handlers)
86
+ │ │ ├── memory.js # Short-term context window manager
87
+ │ │ └── prompts.js # System prompts
88
+ │ ├── mcp/
89
+ │ │ ├── server.js # MCP server setup (stdio transport)
90
+ │ │ └── handlers.js # MCP tool/resource handlers
91
+ │ └── utils/
92
+ │ ├── screenshot.js # Screenshot capture + base64 encode
93
+ │ ├── dom-to-text.js # DOM serializer → readable text/markdown
94
+ │ └── logger.js # Structured logger (pino)
95
+ ├── tests/
96
+ │ ├── cdp/
97
+ │ └── agent/
98
+ └── examples/
99
+ ├── search-google.js
100
+ └── fill-form.js
101
+ ```
102
+
103
+ ---
104
+
105
+ ## Core Modules
106
+
107
+ ### 1. CDP Client (`src/cdp/client.js`)
108
+
109
+ Connects to the Lightpanda CDP endpoint via WebSocket.
110
+
111
+ ```js
112
+ // Interface
113
+ class CDPClient {
114
+ constructor(options: { host, port, targetId? })
115
+ async connect(): Promise<void>
116
+ async send(method: string, params?: object): Promise<any>
117
+ on(event: string, handler: Function): void
118
+ async close(): Promise<void>
119
+ }
120
+ ```
121
+
122
+ - Uses `ws` npm package
123
+ - Implements CDP message ID tracking (auto-increment)
124
+ - Handles CDP events as EventEmitter
125
+ - Reconnect logic with exponential backoff (max 5 attempts)
126
+ - Per-session multiplexing via `sessionId` for multi-tab support
127
+
128
+ ### 2. Session Manager (`src/cdp/session.js`)
129
+
130
+ ```js
131
+ class SessionManager {
132
+ async listTargets(): Promise<Target[]>
133
+ async attachTarget(targetId: string): Promise<CDPClient>
134
+ async createTarget(url?: string): Promise<CDPClient>
135
+ async closeTarget(targetId: string): Promise<void>
136
+ async getActiveSession(): Promise<CDPClient>
137
+ }
138
+ ```
139
+
140
+ ### 3. CDP Domains
141
+
142
+ Each domain wraps CDP method calls into ergonomic async functions:
143
+
144
+ **Page domain:**
145
+ - `navigate(url)` → `Page.navigate`
146
+ - `screenshot(options?)` → `Page.captureScreenshot`
147
+ - `pdf(options?)` → `Page.printToPDF`
148
+ - `reload()` → `Page.reload`
149
+ - `waitForLoad()` → wait for the `Page.loadEventFired` event
150
+ - `getContent()` → `Page.getResourceContent`
151
+ - `setDialogBehavior(action)` → `Page.handleJavaScriptDialog`
152
+
153
+ **DOM domain:**
154
+ - `querySelector(selector)` → `DOM.querySelector`
155
+ - `querySelectorAll(selector)` → `DOM.querySelectorAll`
156
+ - `getDocument()` → `DOM.getDocument`
157
+ - `getOuterHTML(nodeId)` → `DOM.getOuterHTML`
158
+ - `setAttributeValue(nodeId, name, value)` → `DOM.setAttributeValue`
159
+ - `focus(nodeId)` → `DOM.focus`
160
+ - `getBoxModel(nodeId)` → `DOM.getBoxModel`
161
+ - `scrollIntoView(nodeId)` → `DOM.scrollIntoViewIfNeeded`
162
+
163
+ **Input domain:**
164
+ - `click(x, y)` → `Input.dispatchMouseEvent` (move + down + up)
165
+ - `clickSelector(selector)` → resolve node center coords → click
166
+ - `type(text)` → `Input.dispatchKeyEvent` per char
167
+ - `keyPress(key)` → `Input.dispatchKeyEvent`
168
+ - `scroll(x, y, deltaX, deltaY)` → `Input.dispatchMouseEvent` mouseWheel
169
+ - `hover(x, y)` → `Input.dispatchMouseEvent` mouseMoved
170
+
171
+ **Runtime domain:**
172
+ - `evaluate(expression)` → `Runtime.evaluate`
173
+ - `callFunctionOn(funcDecl, objectId, args)` → `Runtime.callFunctionOn`
174
+ - `getProperties(objectId)` → `Runtime.getProperties`
175
+
176
+ **Network domain:**
177
+ - `enable()` / `disable()`
178
+ - `setRequestInterception(patterns)` → `Fetch.enable`
179
+ - `continueRequest(requestId)` → `Fetch.continueRequest`
180
+ - `getResponseBody(requestId)` → `Network.getResponseBody`
181
+
182
+ ---
183
+
184
+ ## Agent Core
185
+
186
+ ### Tool Definitions (`src/agent/tools.js`)
187
+
188
+ The agent operates via tool calls. Tools map directly to CDP domain actions:
189
+
190
+ | Tool Name | Description | Key Params |
191
+ |----------------------|--------------------------------------------------|-----------------------------------|
192
+ | `navigate` | Navigate browser to URL | `url: string` |
193
+ | `screenshot` | Capture current viewport as base64 PNG | `fullPage?: boolean` |
194
+ | `click` | Click an element by CSS selector | `selector: string` |
195
+ | `click_coords` | Click at specific x,y coordinates | `x: number, y: number` |
196
+ | `type_text` | Type text into focused/selected element | `selector: string, text: string` |
197
+ | `key_press` | Press a keyboard key | `key: string` |
198
+ | `scroll` | Scroll page or element | `direction: up\|down, amount: number` |
199
+ | `hover` | Hover over element | `selector: string` |
200
+ | `get_page_content` | Get readable text of current page (DOM→text) | `format: text\|markdown\|html` |
201
+ | `evaluate_js` | Execute JavaScript in page context | `script: string` |
202
+ | `wait` | Wait N milliseconds | `ms: number` |
203
+ | `get_url` | Get current page URL | — |
204
+ | `new_tab` | Open a new browser tab | `url?: string` |
205
+ | `close_tab` | Close current or specified tab | `targetId?: string` |
206
+ | `switch_tab` | Switch active tab | `targetId: string` |
207
+ | `list_tabs` | List all open tabs | — |
208
+ | `go_back` | Browser back | — |
209
+ | `go_forward` | Browser forward | — |
210
+ | `get_element_info` | Get attributes/text of an element | `selector: string` |
211
+ | `select_option` | Select a `<select>` option by value/label | `selector: string, value: string` |
212
+ | `set_checkbox` | Check/uncheck checkbox | `selector: string, checked: bool` |
213
+ | `upload_file` | Set file input value | `selector: string, path: string` |
214
+ | `extract_data` | Extract structured data from page | `schema: object` |
215
+
216
+ ### Planner (`src/agent/planner.js`)
217
+
218
+ - Uses Anthropic Claude API (`claude-sonnet-4-6` default, configurable)
219
+ - System prompt defines: agent role, available tools, output format, safety constraints
220
+ - Receives: user task + screenshot + page content + action history
221
+ - Returns: next tool call(s) or `finish` with result
222
+ - Max steps: configurable (default 20)
223
+ - Step loop:
224
+ 1. Capture screenshot + page content
225
+ 2. Build context message (task, history, current state)
226
+ 3. Call LLM with tool definitions
227
+ 4. Execute returned tool call
228
+ 5. Append to history
229
+ 6. Repeat until `finish` or max steps
230
+
231
+ ### Memory (`src/agent/memory.js`)
232
+
233
+ - Rolling window of last N steps (default 10) to stay within context
234
+ - Each entry: `{ step, tool, params, result, screenshot_b64? }`
235
+ - Summarization: when history exceeds window, summarize oldest entries via LLM
236
+
237
+ ---
238
+
239
+ ## CLI (`bin/cli.js`)
240
+
241
+ ### Usage
242
+
243
+ ```bash
244
+ # One-shot task
245
+ cdp-agent run "go to github.com and star the lightpanda repo"
246
+
247
+ # Interactive REPL mode
248
+ cdp-agent repl
249
+
250
+ # Script mode (task file)
251
+ cdp-agent script ./tasks/my-task.json
252
+
253
+ # With custom browser endpoint
254
+ cdp-agent run "..." --cdp-url ws://localhost:9222
255
+
256
+ # With screenshot output
257
+ cdp-agent run "..." --screenshots ./output/
258
+
259
+ # Verbose CDP logging
260
+ cdp-agent run "..." --debug
261
+ ```
262
+
263
+ ### CLI Options
264
+
265
+ | Flag | Default | Description |
266
+ |---------------------|-----------------------|--------------------------------------|
267
+ | `--cdp-url` | `ws://localhost:9222` | CDP WebSocket endpoint |
268
+ | `--model` | `claude-sonnet-4-6` | Claude model to use |
269
+ | `--max-steps` | `20` | Max agent steps before stopping |
270
+ | `--screenshots` | `null` | Dir to save step screenshots |
271
+ | `--headless` | `true` | Launch browser in headless mode |
272
+ | `--launch` | `false` | Auto-launch Lightpanda before run |
273
+ | `--launch-path` | `lightpanda` | Path to Lightpanda binary |
274
+ | `--debug` | `false` | Log raw CDP messages |
275
+ | `--output` | `text` | Output format: `text`, `json` |
276
+
277
+ ### Interactive REPL Features
278
+
279
+ - Multi-line task input
280
+ - `!screenshot` command — capture and display screenshot
281
+ - `!tabs` — list open tabs
282
+ - `!history` — show action history
283
+ - `!clear` — reset agent memory
284
+ - `!exit` — quit
285
+ - Arrow key history navigation
286
+
287
+ ---
288
+
289
+ ## MCP Server (`bin/mcp.js`)
290
+
291
+ ### Transport
292
+
293
+ - **stdio** (primary) — for Claude Desktop, Cursor integration
294
+ - **HTTP/SSE** (optional) — for remote/network access
295
+
296
+ ### MCP Tools Exposed
297
+
298
+ Each agent tool is exposed as an MCP tool. Additionally:
299
+
300
+ | MCP Tool | Description |
301
+ |-----------------------|-------------------------------------------------|
302
+ | `browser_run_task` | Run a full natural language agent task |
303
+ | `browser_navigate` | Navigate to URL |
304
+ | `browser_screenshot` | Capture screenshot (returns base64 image) |
305
+ | `browser_click` | Click element by selector |
306
+ | `browser_type` | Type text into element |
307
+ | `browser_evaluate` | Execute JavaScript |
308
+ | `browser_get_content` | Get page content as text/markdown |
309
+ | `browser_extract` | Extract structured data from page |
310
+ | `browser_new_tab` | Open new tab |
311
+ | `browser_close_tab` | Close a tab |
312
+ | `browser_list_tabs` | List open tabs |
313
+
314
+ ### MCP Resources
315
+
316
+ | Resource URI | Description |
317
+ |---------------------------|----------------------------------------|
318
+ | `browser://screenshot` | Current viewport screenshot |
319
+ | `browser://page/content` | Current page text content |
320
+ | `browser://page/url` | Current URL |
321
+ | `browser://tabs` | List of open tabs as JSON |
322
+
323
+ ### Claude Desktop Config (`~/Library/Application Support/Claude/claude_desktop_config.json`)
324
+
325
+ ```json
326
+ {
327
+ "mcpServers": {
328
+ "cdp-browser-agent": {
329
+ "command": "node",
330
+ "args": ["/path/to/cdp-browser-agent/bin/mcp.js"],
331
+ "env": {
332
+ "ANTHROPIC_API_KEY": "sk-ant-...",
333
+ "CDP_URL": "ws://localhost:9222"
334
+ }
335
+ }
336
+ }
337
+ }
338
+ ```
339
+
340
+ ---
341
+
342
+ ## Configuration
343
+
344
+ ### Environment Variables (`.env`)
345
+
346
+ ```env
347
+ # Required
348
+ ANTHROPIC_API_KEY=sk-ant-...
349
+
350
+ # Browser
351
+ CDP_URL=ws://localhost:9222
352
+ CDP_LAUNCH_BROWSER=false
353
+ CDP_BROWSER_PATH=lightpanda
354
+ CDP_BROWSER_PORT=9222
355
+
356
+ # Agent
357
+ AGENT_MODEL=claude-sonnet-4-6
358
+ AGENT_MAX_STEPS=20
359
+ AGENT_MAX_TOKENS=4096
360
+ AGENT_INCLUDE_SCREENSHOT=true
361
+ AGENT_SCREENSHOT_QUALITY=80
362
+
363
+ # Logging
364
+ LOG_LEVEL=info
365
+ LOG_FORMAT=pretty
366
+ ```
367
+
368
+ ---
369
+
370
+ ## Dependencies
371
+
372
+ ```json
373
+ {
374
+ "dependencies": {
375
+ "ws": "^8.18.0",
376
+ "@anthropic-ai/sdk": "^0.36.0",
377
+ "@modelcontextprotocol/sdk": "^1.5.0",
378
+ "commander": "^12.0.0",
379
+ "dotenv": "^16.0.0",
380
+ "pino": "^9.0.0",
381
+ "pino-pretty": "^11.0.0",
382
+ "html-to-text": "^9.0.0",
383
+ "ora": "^8.0.0",
384
+ "chalk": "^5.3.0",
385
+ "readline": "builtin"
386
+ },
387
+ "devDependencies": {
388
+ "vitest": "^2.0.0",
389
+ "nock": "^13.0.0"
390
+ },
391
+ "engines": {
392
+ "node": ">=20.0.0"
393
+ }
394
+ }
395
+ ```
396
+
397
+ ---
398
+
399
+ ## Key Implementation Notes
400
+
401
+ ### CDP Connection Flow
402
+
403
+ ```
404
+ 1. GET http://localhost:9222/json/version → browser metadata
405
+ 2. GET http://localhost:9222/json/list → list targets (tabs)
406
+ 3. WS ws://localhost:9222/devtools/page/{targetId} → connect to tab
407
+ 4. Send: { id: 1, method: "Page.enable", params: {} }
408
+ 5. Send: { id: 2, method: "DOM.enable", params: {} }
409
+ 6. Send: { id: 3, method: "Runtime.enable", params: {} }
410
+ ```
411
+
412
+ ### Element Interaction Strategy
413
+
414
+ When clicking/typing by selector:
415
+ 1. `DOM.querySelector` to get `nodeId`
416
+ 2. `DOM.scrollIntoViewIfNeeded(nodeId)` to ensure visibility
417
+ 3. `DOM.getBoxModel(nodeId)` to get element center coordinates (measured after scrolling, so they reflect the element's final viewport position)
418
+ 4. `Input.dispatchMouseEvent` with calculated center coords
419
+
420
+ ### Screenshot + Vision Loop
421
+
422
+ Each agent step:
423
+ 1. `Page.captureScreenshot` → base64 PNG
424
+ 2. Include as `image` content block in Claude message
425
+ 3. Also include the DOM→text representation as a textual fallback (standing in for an accessibility tree)
426
+ 4. Claude uses both visual and text signals to decide next action
427
+
428
+ ### DOM to Readable Text
429
+
430
+ Convert raw DOM to a simplified representation for LLM:
431
+ - Preserve: links (href), buttons (text), inputs (type, placeholder, value), headings, lists, tables
432
+ - Strip: scripts, styles, hidden elements
433
+ - Add: `[BUTTON: text]`, `[INPUT: placeholder]`, `[LINK: text → href]` markers
434
+ - Truncate to ~8000 tokens
435
+
436
+ ### Error Handling
437
+
438
+ - CDP command timeout: 30s default, configurable per domain
439
+ - Selector not found: retry once after 1s wait, then report to agent
440
+ - Navigation errors: capture and include in agent context
441
+ - Browser disconnect: attempt reconnect with exponential backoff, fail gracefully after 5 attempts (the CDP client's reconnect limit)
442
+
443
+ ---
444
+
445
+ ## Testing Strategy
446
+
447
+ ```
448
+ tests/
449
+ ├── cdp/
450
+ │ ├── client.test.js # Unit: WebSocket mock, message parsing
451
+ │ ├── page.test.js # Integration: real Lightpanda instance
452
+ │ └── input.test.js # Integration: click/type actions
453
+ ├── agent/
454
+ │ ├── planner.test.js # Unit: mock Claude API responses
455
+ │ ├── executor.test.js # Unit: tool execution
456
+ │ └── tools.test.js # Unit: tool schema validation
457
+ └── e2e/
458
+ ├── navigate.test.js # E2E: full navigate + screenshot
459
+ └── form.test.js # E2E: fill and submit form
460
+ ```
461
+
462
+ Run: `npx vitest`
463
+ E2E requires Lightpanda running: `npx vitest --project=e2e`
464
+
465
+ ---
466
+
467
+ ## Phase Plan
468
+
469
+ ### Phase 1 — CDP Foundation
470
+ - [ ] CDP WebSocket client with message handling
471
+ - [ ] Session/target manager
472
+ - [ ] Page, DOM, Input, Runtime domain wrappers
473
+ - [ ] Basic screenshot + DOM extraction utilities
474
+
475
+ ### Phase 2 — Agent Core
476
+ - [ ] Tool definitions + JSON schemas
477
+ - [ ] Claude API integration (tool use loop)
478
+ - [ ] Step history / memory
479
+ - [ ] System prompts
480
+
481
+ ### Phase 3 — CLI
482
+ - [ ] `run` command (one-shot)
483
+ - [ ] `repl` interactive mode
484
+ - [ ] `script` file mode
485
+ - [ ] Progress display with ora
486
+ - [ ] Screenshot saving
487
+
488
+ ### Phase 4 — MCP Server
489
+ - [ ] MCP SDK integration (stdio transport)
490
+ - [ ] Expose all browser tools
491
+ - [ ] Screenshot as MCP image resource
492
+ - [ ] Claude Desktop config docs
493
+
494
+ ### Phase 5 — Polish
495
+ - [ ] HTTP/SSE transport for MCP
496
+ - [ ] Auto-launch Lightpanda option
497
+ - [ ] Rate limiting + retry logic
498
+ - [ ] Structured logging
499
+ - [ ] E2E test suite
500
+
501
+ ---
502
+
503
+ ## Security Considerations
504
+
505
+ - Never expose MCP server on public network without auth
506
+ - Sandbox browser profile (no stored credentials/cookies by default)
507
+ - Configurable allowlist/blocklist for navigation URLs
508
+ - File upload limited to explicit user-provided paths
509
+ - `evaluate_js` tool should warn user — arbitrary JS execution
510
+ - No persistent browser storage by default (new profile each run)
Binary file