@porcupine/kuskus 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,123 @@
1
+ import { SessionManager } from '../cdp/session.js';
2
+ import { createPageDomain } from '../cdp/domains/page.js';
3
+ import { createRuntimeDomain } from '../cdp/domains/runtime.js';
4
+ import { Planner } from './planner.js';
5
+ import { Executor } from './executor.js';
6
+ import { AgentMemory } from './memory.js';
7
+ import { htmlToReadableText } from '../utils/dom-to-text.js';
8
+ import { saveScreenshot, screenshotFilename } from '../utils/screenshot.js';
9
+ import logger from '../utils/logger.js';
10
+
/**
 * Kuskus Agent — orchestrates the plan → execute → observe loop.
 *
 * Every step observes the active page (URL, screenshot, HTML), asks the
 * planner for exactly one tool call, runs that call through the executor and
 * records the outcome in short-term memory. The loop ends when the planner
 * chooses `finish` or when the step budget is exhausted.
 */
export class KuskusAgent {
  #session;
  #planner;
  #executor;
  #memory;
  #maxSteps;
  #screenshotDir;
  #onStep;

  /**
   * @param {object} [options]
   * @param {string} [options.cdpUrl] - CDP endpoint; only its hostname and port are used.
   * @param {string} [options.model] - model name forwarded to the planner.
   * @param {number} [options.maxSteps] - maximum number of plan/execute steps per run.
   * @param {number} [options.maxTokens] - token limit forwarded to the planner.
   * @param {boolean} [options.includeScreenshot] - forwarded to the planner.
   * @param {string|null} [options.screenshotDir] - when set, each step's screenshot is saved there.
   * @param {Function|null} [options.onStep] - called with `{ step, tool, params, url }` after planning.
   */
  constructor({
    cdpUrl = process.env.CDP_URL || 'ws://localhost:9222',
    model = process.env.AGENT_MODEL || 'claude-sonnet-4-6',
    maxSteps = Number(process.env.AGENT_MAX_STEPS) || 20,
    maxTokens = Number(process.env.AGENT_MAX_TOKENS) || 4096,
    includeScreenshot = process.env.AGENT_INCLUDE_SCREENSHOT !== 'false',
    screenshotDir = null,
    onStep = null,
  } = {}) {
    const url = new URL(cdpUrl);
    this.#session = new SessionManager({ host: url.hostname, port: Number(url.port) || 9222 });
    this.#planner = new Planner({ model, maxTokens, includeScreenshot });
    this.#executor = new Executor(this.#session);
    this.#memory = new AgentMemory({ windowSize: 10 });
    this.#maxSteps = maxSteps;
    this.#screenshotDir = screenshotDir;
    this.#onStep = onStep;
  }

  /**
   * Connect the underlying session manager.
   * @returns {Promise<KuskusAgent>} this agent, for chaining
   */
  async connect() {
    await this.#session.connect();
    return this;
  }

  /** Close the underlying session manager. */
  async close() {
    await this.#session.close();
  }

  /**
   * Run a natural language task to completion.
   *
   * `steps` in the result is the number of steps actually taken: the step on
   * which `finish` was chosen, or `maxSteps` when the budget ran out.
   *
   * @param {string} task
   * @returns {Promise<{ result: string, data?: any, steps: number }>}
   */
  async run(task) {
    this.#memory.clear();

    for (let step = 1; step <= this.#maxSteps; step++) {
      const { currentUrl, screenshot, pageContent } = await this.#observe(step);

      // Plan next action
      const { toolName, params } = await this.#planner.nextAction({
        task,
        step,
        maxSteps: this.#maxSteps,
        history: this.#memory.toContextString(),
        screenshot,
        pageContent,
        currentUrl,
      });

      this.#onStep?.({ step, tool: toolName, params, url: currentUrl });

      // Finish signal
      if (toolName === 'finish') {
        this.#memory.push({ step, tool: 'finish', params, result: params.result });
        return { result: params.result, data: params.data, steps: step };
      }

      // Execute tool; a failure is recorded so the planner can adapt on the next step.
      let result;
      let error;
      try {
        const raw = await this.#executor.execute(toolName, params);
        result = KuskusAgent.#formatResult(raw);
      } catch (err) {
        // Tolerate thrown non-Error values, which have no `.message`.
        error = err?.message ?? String(err);
        result = `ERROR: ${error}`;
        logger.warn({ step, tool: toolName, error }, 'Tool execution failed');
      }

      this.#memory.push({ step, tool: toolName, params, result, error });
    }

    // Budget exhausted: exactly `maxSteps` steps were taken, not one more.
    return {
      result: `Stopped after ${this.#maxSteps} steps without completing.`,
      steps: this.#maxSteps,
    };
  }

  /**
   * Capture the current browser state for one step. Each probe is
   * best-effort: a failing probe yields a placeholder instead of aborting.
   */
  async #observe(step) {
    const client = await this.#session.getActiveSession();
    const page = createPageDomain(client);
    const runtime = createRuntimeDomain(client);

    const [currentUrl, screenshot, html] = await Promise.all([
      page.getURL().catch(() => 'unknown'),
      page.screenshot({ quality: Number(process.env.AGENT_SCREENSHOT_QUALITY) || 80 }).catch(() => null),
      runtime.evaluate('document.documentElement.outerHTML').catch(() => ''),
    ]);

    const pageContent = htmlToReadableText(html);

    // Save screenshot to disk if configured. Like the probes above this is
    // best-effort: a disk error is logged rather than ending the run.
    if (screenshot && this.#screenshotDir) {
      try {
        await saveScreenshot(screenshot, this.#screenshotDir, screenshotFilename(step));
      } catch (err) {
        logger.warn({ step, error: err?.message ?? String(err) }, 'Failed to save screenshot');
      }
    }

    return { currentUrl, screenshot, pageContent };
  }

  /** Convert a raw tool result into the string stored in memory. */
  static #formatResult(raw) {
    // null and undefined both mean "no output"; typeof null is 'object', so
    // check before the object branch to avoid storing the string "null".
    if (raw == null) return '';
    if (typeof raw !== 'object') return String(raw);
    // Unwrap screenshot objects — don't store full base64 in memory
    if (raw.type === 'screenshot') return '[screenshot captured]';
    return JSON.stringify(raw);
  }
}
@@ -0,0 +1,42 @@
/**
 * Rolling short-term memory for agent step history.
 * Keeps only the most recent `windowSize` steps; older steps are discarded
 * (no summarization is performed).
 */
export class AgentMemory {
  #steps = [];
  #windowSize;

  /**
   * @param {object} [options]
   * @param {number} [options.windowSize=10] - maximum number of steps retained
   */
  constructor({ windowSize = 10 } = {}) {
    this.#windowSize = windowSize;
  }

  /**
   * Add a completed step to memory, evicting the oldest step when the
   * window is full.
   * @param {{ step: number, tool: string, params: object, result: string, error?: string }} entry
   */
  push(entry) {
    this.#steps.push(entry);
    if (this.#steps.length > this.#windowSize) {
      this.#steps.shift();
    }
  }

  /**
   * Return steps formatted as a string for inclusion in the LLM prompt.
   * Successful results are truncated to 300 characters; errors are shown in full.
   * @returns {string} one line per step, or a placeholder when memory is empty
   */
  toContextString() {
    if (this.#steps.length === 0) return 'No previous actions.';
    return this.#steps
      .map((s) => {
        // A step failed whenever `error` is present — even as an empty string
        // (an Error thrown without a message) — so test for null/undefined
        // rather than truthiness.
        const failed = s.error != null;
        const status = failed ? `ERROR: ${s.error}` : `OK: ${String(s.result).slice(0, 300)}`;
        return `Step ${s.step}: ${s.tool}(${JSON.stringify(s.params)}) → ${status}`;
      })
      .join('\n');
  }

  /** Number of steps currently retained. */
  get length() { return this.#steps.length; }

  /** Forget all recorded steps. */
  clear() { this.#steps = []; }

  /** @returns {object[]} a shallow copy of the retained steps, oldest first */
  getAll() { return [...this.#steps]; }
}
@@ -0,0 +1,85 @@
1
+ import Anthropic from '@anthropic-ai/sdk';
2
+ import { SYSTEM_PROMPT, TASK_CONTEXT_TEMPLATE } from './prompts.js';
3
+ import { TOOL_DEFINITIONS } from './tools.js';
4
+ import { htmlToReadableText } from '../utils/dom-to-text.js';
5
+ import logger from '../utils/logger.js';
6
+
/**
 * Drives the agent planning loop using Claude tool use.
 *
 * Each call is a single stateless request: the task, the action history and
 * the current page state are packed into one user message, and the model is
 * required to answer with a tool call.
 */
export class Planner {
  /** Upper bound on page text included in the prompt, in characters. */
  static #MAX_PAGE_CONTENT_CHARS = 6000;

  #client;
  #model;
  #maxTokens;
  #includeScreenshot;

  /**
   * @param {object} [options]
   * @param {string} [options.apiKey] - falls back to `process.env.ANTHROPIC_API_KEY`
   * @param {string} [options.model]
   * @param {number} [options.maxTokens]
   * @param {boolean} [options.includeScreenshot] - attach the screenshot to the prompt when available
   */
  constructor({
    apiKey,
    model = 'claude-sonnet-4-6',
    maxTokens = 4096,
    includeScreenshot = true,
  } = {}) {
    this.#client = new Anthropic({ apiKey: apiKey || process.env.ANTHROPIC_API_KEY });
    this.#model = model;
    this.#maxTokens = maxTokens;
    this.#includeScreenshot = includeScreenshot;
  }

  /**
   * Guess the image media type from the leading base64 characters, which
   * encode the file signature. The declared media type should match the
   * actual bytes, and the caller may hand over JPEG data as well as PNG.
   * Unrecognised input keeps the historical default of `image/png`.
   */
  static #imageMediaType(base64) {
    if (typeof base64 !== 'string') return 'image/png';
    if (base64.startsWith('/9j/')) return 'image/jpeg'; // FF D8 FF
    if (base64.startsWith('R0lGOD')) return 'image/gif'; // "GIF8"
    if (base64.startsWith('UklGR')) return 'image/webp'; // "RIFF"
    return 'image/png';
  }

  /**
   * Ask Claude for the next tool call given the current browser state.
   *
   * @param {object} opts
   * @param {string} opts.task - original user task
   * @param {number} opts.step - current step number
   * @param {number} opts.maxSteps
   * @param {string} opts.history - stringified action history
   * @param {string|null} opts.screenshot - base64 image (PNG, JPEG, GIF or WebP) or null
   * @param {string} opts.pageContent - readable page text; nullish is treated as empty
   * @param {string} opts.currentUrl
   * @returns {Promise<{ toolName: string, params: object }>}
   */
  async nextAction({ task, step, maxSteps, history, screenshot, pageContent, currentUrl }) {
    const userContent = [];

    if (this.#includeScreenshot && screenshot) {
      userContent.push({
        type: 'image',
        source: {
          type: 'base64',
          media_type: Planner.#imageMediaType(screenshot),
          data: screenshot,
        },
      });
    }

    const contextText = [
      TASK_CONTEXT_TEMPLATE(task, step, maxSteps),
      `Current URL: ${currentUrl}`,
      '',
      '## Previous Actions',
      history,
      '',
      '## Current Page Content',
      // Coerce first so a missing page text cannot crash the planning step.
      String(pageContent ?? '').slice(0, Planner.#MAX_PAGE_CONTENT_CHARS),
    ].join('\n');

    userContent.push({ type: 'text', text: contextText });

    logger.debug({ step, url: currentUrl }, 'Calling Claude for next action');

    const response = await this.#client.messages.create({
      model: this.#model,
      max_tokens: this.#maxTokens,
      system: SYSTEM_PROMPT,
      tools: TOOL_DEFINITIONS,
      tool_choice: { type: 'any' },
      messages: [{ role: 'user', content: userContent }],
    });

    const toolUse = response.content.find((b) => b.type === 'tool_use');
    if (!toolUse) {
      // Claude returned text only — treat as finish
      const textBlock = response.content.find((b) => b.type === 'text');
      return { toolName: 'finish', params: { result: textBlock?.text || 'Task complete.' } };
    }

    logger.debug({ tool: toolUse.name, params: toolUse.input }, 'Claude chose tool');
    return { toolName: toolUse.name, params: toolUse.input };
  }
}
@@ -0,0 +1,26 @@
/**
 * System prompt sent with every planning request.
 *
 * The state rule tells the model to rely on the page content (and
 * screenshot, when enabled) that the agent loop already attaches to every
 * step, so it does not spend steps from its budget re-fetching that state.
 */
export const SYSTEM_PROMPT = `You are Kuskus, an AI browser agent that controls a web browser using CDP (Chrome DevTools Protocol).

You will be given a task to complete. You have access to tools that let you interact with the browser: navigate, click, type, scroll, take screenshots, run JavaScript, and more.

## Rules
- Complete the task step by step. Take one action at a time.
- The current page content (and a screenshot, when enabled) is provided with every step. Use it to understand the current state before acting instead of spending a step on \`screenshot\` or \`get_page_content\`.
- When clicking elements, prefer using CSS selectors over coordinates.
- If an action fails (element not found, navigation error), adapt your approach.
- Do not loop forever — if stuck after 3 attempts at the same action, report what you found and stop.
- When the task is complete, call the \`finish\` tool with a clear summary of what was accomplished.
- Keep your reasoning concise — focus on what to do next.

## Element Selection Tips
- Prefer specific selectors: \`button[type="submit"]\`, \`input[name="q"]\`, \`#login-btn\`
- For links: \`a[href*="keyword"]\` or find by visible text via JS
- For forms: target \`name\` or \`id\` attributes
- If selector fails, use \`evaluate_js\` to inspect the DOM

## Safety
- Do not submit forms or make purchases unless explicitly asked.
- Do not enter real credentials — use placeholder values unless the user provides them.
- If you see a CAPTCHA, report it and stop.`;

/**
 * Build the header of the per-step user message.
 * @param {string} task - original user task
 * @param {number} step - current step number
 * @param {number} maxSteps - step budget
 * @returns {string} two lines: the task, then `Step: <step>/<maxSteps>`
 */
export const TASK_CONTEXT_TEMPLATE = (task, step, maxSteps) =>
  `Task: ${task}\nStep: ${step}/${maxSteps}`;
@@ -0,0 +1,242 @@
/**
 * Tool definitions for the Kuskus agent.
 * Each entry has: name, description and input_schema (JSON Schema).
 * This list contains schemas only; no handlers are defined here.
 */

export const TOOL_DEFINITIONS = [
  {
    name: 'navigate',
    description: 'Navigate the browser to a URL.',
    input_schema: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'The URL to navigate to.' },
      },
      required: ['url'],
    },
  },
  {
    name: 'screenshot',
    description: 'Capture the current browser viewport as a PNG screenshot. Returns base64 image.',
    input_schema: {
      type: 'object',
      properties: {
        full_page: { type: 'boolean', description: 'Capture the full scrollable page.' },
      },
    },
  },
  {
    name: 'get_page_content',
    description: 'Get the readable text content of the current page (HTML converted to text/markdown).',
    input_schema: {
      type: 'object',
      properties: {
        format: { type: 'string', enum: ['text', 'html'], description: 'Output format.' },
      },
    },
  },
  {
    name: 'get_url',
    description: 'Get the current page URL.',
    input_schema: { type: 'object', properties: {} },
  },
  {
    name: 'click',
    description: 'Click an element on the page using a CSS selector.',
    input_schema: {
      type: 'object',
      properties: {
        selector: { type: 'string', description: 'CSS selector of the element to click.' },
      },
      required: ['selector'],
    },
  },
  {
    name: 'click_coords',
    description: 'Click at specific x, y coordinates on the page.',
    input_schema: {
      type: 'object',
      properties: {
        x: { type: 'number' },
        y: { type: 'number' },
      },
      required: ['x', 'y'],
    },
  },
  {
    name: 'type_text',
    description: 'Click an input element and type text into it. Clears existing content first.',
    input_schema: {
      type: 'object',
      properties: {
        selector: { type: 'string', description: 'CSS selector of the input/textarea.' },
        text: { type: 'string', description: 'Text to type.' },
        clear_first: { type: 'boolean', description: 'Select all and delete before typing. Default true.' },
      },
      required: ['selector', 'text'],
    },
  },
  {
    name: 'key_press',
    description: 'Press a keyboard key (Enter, Tab, Escape, ArrowDown, etc.).',
    input_schema: {
      type: 'object',
      properties: {
        key: { type: 'string', description: 'DOM key name e.g. Enter, Tab, Escape.' },
      },
      required: ['key'],
    },
  },
  {
    name: 'scroll',
    description: 'Scroll the page up or down.',
    input_schema: {
      type: 'object',
      properties: {
        direction: { type: 'string', enum: ['up', 'down'], description: 'Scroll direction.' },
        amount: { type: 'number', description: 'Pixels to scroll. Default 500.' },
      },
      required: ['direction'],
    },
  },
  {
    name: 'hover',
    description: 'Hover over an element by CSS selector.',
    input_schema: {
      type: 'object',
      properties: {
        selector: { type: 'string' },
      },
      required: ['selector'],
    },
  },
  {
    name: 'evaluate_js',
    description: 'Execute JavaScript in the page context and return the result. Use for complex DOM queries or interactions.',
    input_schema: {
      type: 'object',
      properties: {
        script: { type: 'string', description: 'JavaScript expression or statement to evaluate.' },
      },
      required: ['script'],
    },
  },
  {
    name: 'wait',
    description: 'Wait for a specified number of milliseconds.',
    input_schema: {
      type: 'object',
      properties: {
        // The documented limit is also expressed as schema bounds so it is
        // machine-readable, not only prose.
        ms: {
          type: 'number',
          minimum: 0,
          maximum: 10000,
          description: 'Milliseconds to wait (max 10000).',
        },
      },
      required: ['ms'],
    },
  },
  {
    name: 'get_element_info',
    description: 'Get attributes and text content of an element.',
    input_schema: {
      type: 'object',
      properties: {
        selector: { type: 'string' },
      },
      required: ['selector'],
    },
  },
  {
    name: 'select_option',
    description: 'Select an option in a <select> element.',
    input_schema: {
      type: 'object',
      properties: {
        selector: { type: 'string' },
        value: { type: 'string', description: 'Option value or visible label text.' },
      },
      required: ['selector', 'value'],
    },
  },
  {
    name: 'set_checkbox',
    description: 'Check or uncheck a checkbox or radio input.',
    input_schema: {
      type: 'object',
      properties: {
        selector: { type: 'string' },
        checked: { type: 'boolean' },
      },
      required: ['selector', 'checked'],
    },
  },
  {
    name: 'go_back',
    description: 'Navigate back in browser history.',
    input_schema: { type: 'object', properties: {} },
  },
  {
    name: 'go_forward',
    description: 'Navigate forward in browser history.',
    input_schema: { type: 'object', properties: {} },
  },
  {
    name: 'new_tab',
    description: 'Open a new browser tab.',
    input_schema: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'URL to open in the new tab.' },
      },
    },
  },
  {
    name: 'list_tabs',
    description: 'List all open browser tabs.',
    input_schema: { type: 'object', properties: {} },
  },
  {
    name: 'switch_tab',
    description: 'Switch to a different browser tab.',
    input_schema: {
      type: 'object',
      properties: {
        target_id: { type: 'string', description: 'Target ID from list_tabs.' },
      },
      required: ['target_id'],
    },
  },
  {
    name: 'close_tab',
    description: 'Close a browser tab.',
    input_schema: {
      type: 'object',
      properties: {
        target_id: { type: 'string', description: 'Target ID to close. Closes active tab if omitted.' },
      },
    },
  },
  {
    name: 'extract_data',
    description: 'Extract structured data from the current page using a JavaScript extractor expression.',
    input_schema: {
      type: 'object',
      properties: {
        script: {
          type: 'string',
          description: 'JS expression returning an object/array with the data you want to extract.',
        },
      },
      required: ['script'],
    },
  },
  {
    name: 'finish',
    description: 'Signal that the task is complete. Provide a summary of what was accomplished.',
    input_schema: {
      type: 'object',
      properties: {
        result: { type: 'string', description: 'Summary of the completed task and any results.' },
        // Intentionally untyped: any JSON value is accepted.
        data: { description: 'Optional structured data returned from the task (any type).' },
      },
      required: ['result'],
    },
  },
];
@@ -0,0 +1,123 @@
1
+ import { EventEmitter } from 'events';
2
+ import WebSocket from 'ws';
3
+ import logger from '../utils/logger.js';
4
+
/** Default time, in milliseconds, to wait for a command response. */
const DEFAULT_TIMEOUT = 30_000;

/**
 * Low-level CDP client over WebSocket.
 * Supports both browser-level and page-level (session) connections.
 *
 * Emits:
 * - `<method>` for every protocol event, with the event params;
 * - `<sessionId>.<method>` as well, when the client was created with a sessionId;
 * - `disconnected` with `{ code, reason }` when the socket closes.
 */
export class CDPClient extends EventEmitter {
  #ws = null;
  #pending = new Map(); // id → { resolve, reject, timer }
  #nextId = 1;
  #sessionId = null;

  /**
   * @param {object} [options]
   * @param {string} [options.url] - WebSocket endpoint to connect to
   * @param {string|null} [options.sessionId] - attached to every outgoing command when set
   */
  constructor({ url, sessionId = null } = {}) {
    super();
    this.url = url;
    this.#sessionId = sessionId;
  }

  get sessionId() { return this.#sessionId; }

  /**
   * Open the WebSocket.
   * @returns {Promise<void>} resolves on open; rejects if the socket errors
   *   or closes before it has opened
   */
  async connect() {
    return new Promise((resolve, reject) => {
      const ws = new WebSocket(this.url);
      this.#ws = ws;
      let opened = false;

      ws.once('open', () => {
        opened = true;
        logger.debug({ url: this.url }, 'CDP WebSocket connected');
        resolve();
      });

      // Persistent listener: an 'error' event with no listener is thrown by
      // EventEmitter, so errors after the handshake must still be handled.
      ws.on('error', (err) => {
        if (!opened) {
          reject(err);
          return;
        }
        logger.warn({ url: this.url, error: err?.message }, 'CDP WebSocket error');
      });

      ws.on('message', (data) => {
        let msg;
        try {
          msg = JSON.parse(data.toString());
        } catch (err) {
          logger.warn({ error: err.message }, 'Ignoring malformed CDP message');
          return;
        }
        // Valid JSON that is not an object (null, a number, …) is not a CDP frame.
        if (msg === null || typeof msg !== 'object') return;
        this.#onMessage(msg);
      });

      ws.on('close', (code, reason) => {
        logger.debug({ code }, 'CDP WebSocket closed');
        // Settle connect() if the socket never opened (no-op after an error rejection).
        if (!opened) {
          reject(new Error(`CDP connection closed before opening (code ${code})`));
        }
        this.emit('disconnected', { code, reason: String(reason ?? '') });
        // Reject all pending
        for (const [id, { reject: rej, timer }] of this.#pending) {
          clearTimeout(timer);
          rej(new Error(`CDP connection closed (code ${code})`));
          this.#pending.delete(id);
        }
      });
    });
  }

  /** Route one decoded protocol message to event listeners or to its pending command. */
  #onMessage(msg) {
    // Protocol event
    if (msg.method) {
      this.emit(msg.method, msg.params);
      // Also emit the session-scoped name, but only when it differs from the
      // plain one; otherwise listeners would be invoked twice per event.
      if (this.#sessionId) {
        this.emit(`${this.#sessionId}.${msg.method}`, msg.params);
      }
      return;
    }

    // Session-scoped message carrying neither a method nor a command id
    if (msg.sessionId && msg.method === undefined && msg.id === undefined) {
      this.emit(`session.${msg.sessionId}`, msg);
      return;
    }

    // Response to a command
    if (msg.id !== undefined) {
      const entry = this.#pending.get(msg.id);
      if (!entry) return; // already timed out, or not ours
      clearTimeout(entry.timer);
      this.#pending.delete(msg.id);
      if (msg.error) {
        const err = new Error(msg.error.message || 'CDP error');
        err.code = msg.error.code;
        err.data = msg.error.data;
        entry.reject(err);
      } else {
        entry.resolve(msg.result ?? {});
      }
    }
  }

  /**
   * Send a CDP command and return the result.
   *
   * Rejects when the socket is not open, when sending fails, when the
   * protocol answers with an error (the rejection carries `code` and `data`),
   * when no response arrives within `timeout`, or when the socket closes first.
   *
   * @param {string} method
   * @param {object} [params]
   * @param {number} [timeout]
   * @returns {Promise<object>}
   */
  send(method, params = {}, timeout = DEFAULT_TIMEOUT) {
    return new Promise((resolve, reject) => {
      const ws = this.#ws;
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        reject(new Error('CDP WebSocket is not open'));
        return;
      }

      const id = this.#nextId++;
      const msg = { id, method, params };
      if (this.#sessionId) msg.sessionId = this.#sessionId;

      const timer = setTimeout(() => {
        this.#pending.delete(id);
        reject(new Error(`CDP command timed out: ${method}`));
      }, timeout);

      this.#pending.set(id, { resolve, reject, timer });

      logger.debug({ method, params }, 'CDP →');
      try {
        ws.send(JSON.stringify(msg));
      } catch (err) {
        // Serialization or send failed synchronously: release the pending
        // entry and its timer instead of leaving them until the timeout.
        clearTimeout(timer);
        this.#pending.delete(id);
        reject(err);
      }
    });
  }

  /**
   * Start closing the socket and drop the reference to it. Pending commands
   * are rejected by the socket's close handler.
   */
  async close() {
    if (this.#ws) {
      this.#ws.close();
      this.#ws = null;
    }
  }
}