npm - @porcupine/kuskus - Versions diffs - 0.1.0 - Mend

@porcupine/kuskus 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

package/.claude/settings.local.json +9 -0
package/.env.example +19 -0
package/SPEC.md +510 -0
package/assets/logo.png +0 -0
package/bin/cli.js +313 -0
package/bin/mcp.js +33 -0
package/examples/claude-desktop-config.json +15 -0
package/examples/fill-form.js +23 -0
package/examples/search-google.js +28 -0
package/package.json +37 -0
package/src/agent/executor.js +213 -0
package/src/agent/index.js +123 -0
package/src/agent/memory.js +42 -0
package/src/agent/planner.js +85 -0
package/src/agent/prompts.js +26 -0
package/src/agent/tools.js +242 -0
package/src/cdp/client.js +123 -0
package/src/cdp/domains/dom.js +85 -0
package/src/cdp/domains/input.js +84 -0
package/src/cdp/domains/network.js +75 -0
package/src/cdp/domains/page.js +80 -0
package/src/cdp/domains/runtime.js +66 -0
package/src/cdp/domains/target.js +28 -0
package/src/cdp/session.js +142 -0
package/src/mcp/handlers.js +99 -0
package/src/mcp/server.js +192 -0
package/src/utils/browser.js +86 -0
package/src/utils/dom-to-text.js +78 -0
package/src/utils/install.js +138 -0
package/src/utils/logger.js +16 -0
package/src/utils/screenshot.js +26 -0
package/tests/agent/memory.test.js +37 -0
package/tests/agent/tools.test.js +24 -0
package/tests/cdp/client.test.js +52 -0
package/vitest.config.js +19 -0

package/src/agent/index.js ADDED Viewed

@@ -0,0 +1,123 @@
+import { SessionManager } from '../cdp/session.js';
+import { createPageDomain } from '../cdp/domains/page.js';
+import { createRuntimeDomain } from '../cdp/domains/runtime.js';
+import { Planner } from './planner.js';
+import { Executor } from './executor.js';
+import { AgentMemory } from './memory.js';
+import { htmlToReadableText } from '../utils/dom-to-text.js';
+import { saveScreenshot, screenshotFilename } from '../utils/screenshot.js';
+import logger from '../utils/logger.js';
+/**
+ * Kuskus Agent — orchestrates the plan → execute → observe loop.
+ */
+export class KuskusAgent {
+  #session;
+  #planner;
+  #executor;
+  #memory;
+  #maxSteps;
+  #screenshotDir;
+  #onStep;
+  constructor({
+    cdpUrl = process.env.CDP_URL || 'ws://localhost:9222',
+    model = process.env.AGENT_MODEL || 'claude-sonnet-4-6',
+    maxSteps = Number(process.env.AGENT_MAX_STEPS) || 20,
+    maxTokens = Number(process.env.AGENT_MAX_TOKENS) || 4096,
+    includeScreenshot = process.env.AGENT_INCLUDE_SCREENSHOT !== 'false',
+    screenshotDir = null,
+    onStep = null,
+  } = {}) {
+    const url = new URL(cdpUrl);
+    this.#session = new SessionManager({ host: url.hostname, port: Number(url.port) || 9222 });
+    this.#planner = new Planner({ model, maxTokens, includeScreenshot });
+    this.#executor = new Executor(this.#session);
+    this.#memory = new AgentMemory({ windowSize: 10 });
+    this.#maxSteps = maxSteps;
+    this.#screenshotDir = screenshotDir;
+    this.#onStep = onStep;
+  }
+  async connect() {
+    await this.#session.connect();
+    return this;
+  }
+  async close() {
+    await this.#session.close();
+  }
+  /**
+   * Run a natural language task to completion.
+   * @param {string} task
+   * @returns {Promise<{ result: string, data?: any, steps: number }>}
+   */
+  async run(task) {
+    this.#memory.clear();
+    let step = 0;
+    for (;;) {
+      step++;
+      if (step > this.#maxSteps) {
+        return { result: `Stopped after ${this.#maxSteps} steps without completing.`, steps: step };
+      }
+      // Observe current state
+      const client = await this.#session.getActiveSession();
+      const page = createPageDomain(client);
+      const runtime = createRuntimeDomain(client);
+      const [currentUrl, screenshot, html] = await Promise.all([
+        page.getURL().catch(() => 'unknown'),
+        page.screenshot({ quality: Number(process.env.AGENT_SCREENSHOT_QUALITY) || 80 }).catch(() => null),
+        runtime.evaluate('document.documentElement.outerHTML').catch(() => ''),
+      ]);
+      const pageContent = htmlToReadableText(html);
+      // Save screenshot to disk if configured
+      if (screenshot && this.#screenshotDir) {
+        const filename = screenshotFilename(step);
+        await saveScreenshot(screenshot, this.#screenshotDir, filename);
+      }
+      // Plan next action
+      const { toolName, params } = await this.#planner.nextAction({
+        task,
+        step,
+        maxSteps: this.#maxSteps,
+        history: this.#memory.toContextString(),
+        screenshot,
+        pageContent,
+        currentUrl,
+      });
+      this.#onStep?.({ step, tool: toolName, params, url: currentUrl });
+      // Finish signal
+      if (toolName === 'finish') {
+        this.#memory.push({ step, tool: 'finish', params, result: params.result });
+        return { result: params.result, data: params.data, steps: step };
+      }
+      // Execute tool
+      let result, error;
+      try {
+        const raw = await this.#executor.execute(toolName, params);
+        // Unwrap screenshot objects — don't store full base64 in memory
+        if (raw && typeof raw === 'object' && raw.type === 'screenshot') {
+          result = '[screenshot captured]';
+        } else {
+          result = typeof raw === 'object' ? JSON.stringify(raw) : String(raw ?? '');
+        }
+      } catch (err) {
+        error = err.message;
+        result = `ERROR: ${err.message}`;
+        logger.warn({ step, tool: toolName, error: err.message }, 'Tool execution failed');
+      }
+      this.#memory.push({ step, tool: toolName, params, result, error });
+    }
+  }
+}

package/src/agent/memory.js ADDED Viewed

@@ -0,0 +1,42 @@
+/**
+ * Rolling short-term memory for agent step history.
+ * Keeps the last N steps in full, summarizes older ones if needed.
+ */
+export class AgentMemory {
+  #steps = [];
+  #windowSize;
+  constructor({ windowSize = 10 } = {}) {
+    this.#windowSize = windowSize;
+  }
+  /**
+   * Add a completed step to memory.
+   * @param {{ step: number, tool: string, params: object, result: string, error?: string }} entry
+   */
+  push(entry) {
+    this.#steps.push(entry);
+    if (this.#steps.length > this.#windowSize) {
+      this.#steps.shift();
+    }
+  }
+  /**
+   * Return steps formatted as a string for inclusion in the LLM prompt.
+   */
+  toContextString() {
+    if (this.#steps.length === 0) return 'No previous actions.';
+    return this.#steps
+      .map((s) => {
+        const status = s.error ? `ERROR: ${s.error}` : `OK: ${String(s.result).slice(0, 300)}`;
+        return `Step ${s.step}: ${s.tool}(${JSON.stringify(s.params)}) → ${status}`;
+      })
+      .join('\n');
+  }
+  get length() { return this.#steps.length; }
+  clear() { this.#steps = []; }
+  getAll() { return [...this.#steps]; }
+}

package/src/agent/planner.js ADDED Viewed

@@ -0,0 +1,85 @@
+import Anthropic from '@anthropic-ai/sdk';
+import { SYSTEM_PROMPT, TASK_CONTEXT_TEMPLATE } from './prompts.js';
+import { TOOL_DEFINITIONS } from './tools.js';
+import { htmlToReadableText } from '../utils/dom-to-text.js';
+import logger from '../utils/logger.js';
+/**
+ * Drives the agent planning loop using Claude tool use.
+ */
+export class Planner {
+  #client;
+  #model;
+  #maxTokens;
+  #includeScreenshot;
+  constructor({
+    apiKey,
+    model = 'claude-sonnet-4-6',
+    maxTokens = 4096,
+    includeScreenshot = true,
+  } = {}) {
+    this.#client = new Anthropic({ apiKey: apiKey || process.env.ANTHROPIC_API_KEY });
+    this.#model = model;
+    this.#maxTokens = maxTokens;
+    this.#includeScreenshot = includeScreenshot;
+  }
+  /**
+   * Ask Claude for the next tool call given the current browser state.
+   *
+   * @param {object} opts
+   * @param {string} opts.task          - original user task
+   * @param {number} opts.step          - current step number
+   * @param {number} opts.maxSteps
+   * @param {string} opts.history       - stringified action history
+   * @param {string|null} opts.screenshot   - base64 PNG or null
+   * @param {string} opts.pageContent   - readable page text
+   * @param {string} opts.currentUrl
+   * @returns {Promise<{ toolName: string, params: object }>}
+   */
+  async nextAction({ task, step, maxSteps, history, screenshot, pageContent, currentUrl }) {
+    const userContent = [];
+    if (this.#includeScreenshot && screenshot) {
+      userContent.push({
+        type: 'image',
+        source: { type: 'base64', media_type: 'image/png', data: screenshot },
+      });
+    }
+    const contextText = [
+      TASK_CONTEXT_TEMPLATE(task, step, maxSteps),
+      `Current URL: ${currentUrl}`,
+      '',
+      '## Previous Actions',
+      history,
+      '',
+      '## Current Page Content',
+      pageContent.slice(0, 6000),
+    ].join('\n');
+    userContent.push({ type: 'text', text: contextText });
+    logger.debug({ step, url: currentUrl }, 'Calling Claude for next action');
+    const response = await this.#client.messages.create({
+      model: this.#model,
+      max_tokens: this.#maxTokens,
+      system: SYSTEM_PROMPT,
+      tools: TOOL_DEFINITIONS,
+      tool_choice: { type: 'any' },
+      messages: [{ role: 'user', content: userContent }],
+    });
+    const toolUse = response.content.find((b) => b.type === 'tool_use');
+    if (!toolUse) {
+      // Claude returned text only — treat as finish
+      const textBlock = response.content.find((b) => b.type === 'text');
+      return { toolName: 'finish', params: { result: textBlock?.text || 'Task complete.' } };
+    }
+    logger.debug({ tool: toolUse.name, params: toolUse.input }, 'Claude chose tool');
+    return { toolName: toolUse.name, params: toolUse.input };
+  }
+}

package/src/agent/prompts.js ADDED Viewed

@@ -0,0 +1,26 @@
+export const SYSTEM_PROMPT = `You are Kuskus, an AI browser agent that controls a web browser using CDP (Chrome DevTools Protocol).
+You will be given a task to complete. You have access to tools that let you interact with the browser: navigate, click, type, scroll, take screenshots, run JavaScript, and more.
+## Rules
+- Complete the task step by step. Take one action at a time.
+- Always take a screenshot or get page content to understand the current state before acting.
+- When clicking elements, prefer using CSS selectors over coordinates.
+- If an action fails (element not found, navigation error), adapt your approach.
+- Do not loop forever — if stuck after 3 attempts at the same action, report what you found and stop.
+- When the task is complete, call the \`finish\` tool with a clear summary of what was accomplished.
+- Keep your reasoning concise — focus on what to do next.
+## Element Selection Tips
+- Prefer specific selectors: \`button[type="submit"]\`, \`input[name="q"]\`, \`#login-btn\`
+- For links: \`a[href*="keyword"]\` or find by visible text via JS
+- For forms: target \`name\` or \`id\` attributes
+- If selector fails, use \`evaluate_js\` to inspect the DOM
+## Safety
+- Do not submit forms or make purchases unless explicitly asked.
+- Do not enter real credentials — use placeholder values unless the user provides them.
+- If you see a CAPTCHA, report it and stop.`;
+export const TASK_CONTEXT_TEMPLATE = (task, step, maxSteps) =>
+  `Task: ${task}\nStep: ${step}/${maxSteps}`;

package/src/agent/tools.js ADDED Viewed

@@ -0,0 +1,242 @@
+/**
+ * Tool definitions for the Kuskus agent.
+ * Each entry has: name, description, and input_schema (JSON Schema).
+ * The entries are descriptions only — no handler is attached to them here;
+ * the planner passes this array as the `tools` option of its Claude request.
+ */
+export const TOOL_DEFINITIONS = [
+  {
+    name: 'navigate',
+    description: 'Navigate the browser to a URL.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        url: { type: 'string', description: 'The URL to navigate to.' },
+      },
+      required: ['url'],
+    },
+  },
+  {
+    name: 'screenshot',
+    description: 'Capture the current browser viewport as a PNG screenshot. Returns base64 image.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        full_page: { type: 'boolean', description: 'Capture the full scrollable page.' },
+      },
+    },
+  },
+  {
+    name: 'get_page_content',
+    description: 'Get the readable text content of the current page (HTML converted to text/markdown).',
+    input_schema: {
+      type: 'object',
+      properties: {
+        format: { type: 'string', enum: ['text', 'html'], description: 'Output format.' },
+      },
+    },
+  },
+  {
+    name: 'get_url',
+    description: 'Get the current page URL.',
+    input_schema: { type: 'object', properties: {} },
+  },
+  {
+    name: 'click',
+    description: 'Click an element on the page using a CSS selector.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        selector: { type: 'string', description: 'CSS selector of the element to click.' },
+      },
+      required: ['selector'],
+    },
+  },
+  {
+    name: 'click_coords',
+    description: 'Click at specific x, y coordinates on the page.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        x: { type: 'number' },
+        y: { type: 'number' },
+      },
+      required: ['x', 'y'],
+    },
+  },
+  {
+    name: 'type_text',
+    description: 'Click an input element and type text into it. Clears existing content first.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        selector: { type: 'string', description: 'CSS selector of the input/textarea.' },
+        text: { type: 'string', description: 'Text to type.' },
+        clear_first: { type: 'boolean', description: 'Select all and delete before typing. Default true.' },
+      },
+      required: ['selector', 'text'],
+    },
+  },
+  {
+    name: 'key_press',
+    description: 'Press a keyboard key (Enter, Tab, Escape, ArrowDown, etc.).',
+    input_schema: {
+      type: 'object',
+      properties: {
+        key: { type: 'string', description: 'DOM key name e.g. Enter, Tab, Escape.' },
+      },
+      required: ['key'],
+    },
+  },
+  {
+    name: 'scroll',
+    description: 'Scroll the page up or down.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        direction: { type: 'string', enum: ['up', 'down'], description: 'Scroll direction.' },
+        amount: { type: 'number', description: 'Pixels to scroll. Default 500.' },
+      },
+      required: ['direction'],
+    },
+  },
+  {
+    name: 'hover',
+    description: 'Hover over an element by CSS selector.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        selector: { type: 'string' },
+      },
+      required: ['selector'],
+    },
+  },
+  {
+    name: 'evaluate_js',
+    description: 'Execute JavaScript in the page context and return the result. Use for complex DOM queries or interactions.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        script: { type: 'string', description: 'JavaScript expression or statement to evaluate.' },
+      },
+      required: ['script'],
+    },
+  },
+  {
+    name: 'wait',
+    description: 'Wait for a specified number of milliseconds.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        ms: { type: 'number', description: 'Milliseconds to wait (max 10000).' },
+      },
+      required: ['ms'],
+    },
+  },
+  {
+    name: 'get_element_info',
+    description: 'Get attributes and text content of an element.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        selector: { type: 'string' },
+      },
+      required: ['selector'],
+    },
+  },
+  {
+    name: 'select_option',
+    description: 'Select an option in a <select> element.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        selector: { type: 'string' },
+        value: { type: 'string', description: 'Option value or visible label text.' },
+      },
+      required: ['selector', 'value'],
+    },
+  },
+  {
+    name: 'set_checkbox',
+    description: 'Check or uncheck a checkbox or radio input.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        selector: { type: 'string' },
+        checked: { type: 'boolean' },
+      },
+      required: ['selector', 'checked'],
+    },
+  },
+  {
+    name: 'go_back',
+    description: 'Navigate back in browser history.',
+    input_schema: { type: 'object', properties: {} },
+  },
+  {
+    name: 'go_forward',
+    description: 'Navigate forward in browser history.',
+    input_schema: { type: 'object', properties: {} },
+  },
+  {
+    name: 'new_tab',
+    description: 'Open a new browser tab.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        url: { type: 'string', description: 'URL to open in the new tab.' },
+      },
+    },
+  },
+  {
+    name: 'list_tabs',
+    description: 'List all open browser tabs.',
+    input_schema: { type: 'object', properties: {} },
+  },
+  {
+    name: 'switch_tab',
+    description: 'Switch to a different browser tab.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        target_id: { type: 'string', description: 'Target ID from list_tabs.' },
+      },
+      required: ['target_id'],
+    },
+  },
+  {
+    name: 'close_tab',
+    description: 'Close a browser tab.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        target_id: { type: 'string', description: 'Target ID to close. Closes active tab if omitted.' },
+      },
+    },
+  },
+  {
+    name: 'extract_data',
+    description: 'Extract structured data from the current page using a JavaScript extractor expression.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        script: {
+          type: 'string',
+          description: 'JS expression returning an object/array with the data you want to extract.',
+        },
+      },
+      required: ['script'],
+    },
+  },
+  {
+    name: 'finish',
+    description: 'Signal that the task is complete. Provide a summary of what was accomplished.',
+    input_schema: {
+      type: 'object',
+      properties: {
+        result: { type: 'string', description: 'Summary of the completed task and any results.' },
+        data: { description: 'Optional structured data returned from the task (any type).' },
+      },
+      required: ['result'],
+    },
+  },
+];

package/src/cdp/client.js ADDED Viewed

@@ -0,0 +1,123 @@
+import { EventEmitter } from 'events';
+import WebSocket from 'ws';
+import logger from '../utils/logger.js';
+const DEFAULT_TIMEOUT = 30_000;
+/**
+ * Low-level CDP client over WebSocket.
+ * Supports both browser-level and page-level (session) connections.
+ */
+export class CDPClient extends EventEmitter {
+  #ws = null;
+  #pending = new Map();   // id → { resolve, reject, timer }
+  #nextId = 1;
+  #sessionId = null;
+  constructor({ url, sessionId = null } = {}) {
+    super();
+    this.url = url;
+    this.#sessionId = sessionId;
+  }
+  get sessionId() { return this.#sessionId; }
+  async connect() {
+    return new Promise((resolve, reject) => {
+      const ws = new WebSocket(this.url);
+      this.#ws = ws;
+      ws.once('open', () => {
+        logger.debug({ url: this.url }, 'CDP WebSocket connected');
+        resolve();
+      });
+      ws.once('error', (err) => {
+        reject(err);
+      });
+      ws.on('message', (data) => {
+        this.#onMessage(JSON.parse(data.toString()));
+      });
+      ws.on('close', (code, reason) => {
+        logger.debug({ code }, 'CDP WebSocket closed');
+        this.emit('disconnected', { code, reason: reason.toString() });
+        // Reject all pending
+        for (const [id, { reject: rej, timer }] of this.#pending) {
+          clearTimeout(timer);
+          rej(new Error(`CDP connection closed (code ${code})`));
+          this.#pending.delete(id);
+        }
+      });
+    });
+  }
+  #onMessage(msg) {
+    // Flat event (browser-level)
+    if (msg.method) {
+      const event = this.#sessionId ? `${this.#sessionId}.${msg.method}` : msg.method;
+      this.emit(msg.method, msg.params);
+      this.emit(event, msg.params);
+      return;
+    }
+    // Session-multiplexed event
+    if (msg.sessionId && msg.method === undefined && msg.id === undefined) {
+      this.emit(`session.${msg.sessionId}`, msg);
+      return;
+    }
+    // Response to a command
+    if (msg.id !== undefined) {
+      const entry = this.#pending.get(msg.id);
+      if (!entry) return;
+      clearTimeout(entry.timer);
+      this.#pending.delete(msg.id);
+      if (msg.error) {
+        const err = new Error(msg.error.message || 'CDP error');
+        err.code = msg.error.code;
+        err.data = msg.error.data;
+        entry.reject(err);
+      } else {
+        entry.resolve(msg.result ?? {});
+      }
+    }
+  }
+  /**
+   * Send a CDP command and return the result.
+   * @param {string} method
+   * @param {object} [params]
+   * @param {number} [timeout]
+   * @returns {Promise<object>}
+   */
+  send(method, params = {}, timeout = DEFAULT_TIMEOUT) {
+    return new Promise((resolve, reject) => {
+      if (!this.#ws || this.#ws.readyState !== WebSocket.OPEN) {
+        return reject(new Error('CDP WebSocket is not open'));
+      }
+      const id = this.#nextId++;
+      const msg = { id, method, params };
+      if (this.#sessionId) msg.sessionId = this.#sessionId;
+      const timer = setTimeout(() => {
+        this.#pending.delete(id);
+        reject(new Error(`CDP command timed out: ${method}`));
+      }, timeout);
+      this.#pending.set(id, { resolve, reject, timer });
+      logger.debug({ method, params }, 'CDP →');
+      this.#ws.send(JSON.stringify(msg));
+    });
+  }
+  async close() {
+    if (this.#ws) {
+      this.#ws.close();
+      this.#ws = null;
+    }
+  }
+}