npm - browser-use - Versions diffs - 0.6.0 → 0.6.1 - Mend

browser-use 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md +22 -17
package/dist/agent/service.js +11 -2
package/dist/cli.js +1 -1
package/dist/config.js +1 -1
package/dist/controller/registry/views.d.ts +2 -0
package/dist/controller/registry/views.js +44 -17
package/dist/observability.js +1 -1
package/dist/utils.js +1 -1
package/package.json +10 -9

package/README.md CHANGED Viewed

@@ -23,7 +23,7 @@
 ## ✨ Features
 - 🤖 **Autonomous Browser Control** — AI-driven navigation, clicking, typing, form filling, scrolling, and tab management
-- 🧠 **10+ LLM Providers** — OpenAI, Anthropic, Google Gemini, Azure, AWS Bedrock, Groq, Ollama, DeepSeek, OpenRouter, Mistral, Cerebras, and custom providers
+- 🧠 **15+ LLM Providers & Adapters** — OpenAI, Anthropic, Google Gemini, Azure, AWS Bedrock, Groq, Ollama, DeepSeek, OpenRouter, Mistral, Cerebras, Browser Use, LiteLLM, OCI Raw, Vercel, and custom providers
 - 👁️ **Vision Support** — Screenshot-based understanding for visual web interactions
 - 🔧 **45+ Built-in Actions** — Navigation, element interaction, scrolling, forms, tabs, content extraction, file I/O, and more
 - 🧩 **Custom Actions** — Extensible registry with Zod schema validation, domain restrictions, and page filters
@@ -112,7 +112,7 @@ npx browser-use --mcp
 | **BrowserSession** | Playwright wrapper — browser lifecycle, tab management, screenshots    |
 | **DomService**     | Extracts interactive elements with indexed mapping for LLM consumption |
 | **MessageManager** | Manages LLM conversation history with token optimization               |
-| **LLM Providers**  | Unified `BaseChatModel` interface across 10+ providers                 |
+| **LLM Providers**  | Unified `BaseChatModel` interface across 15+ providers and adapters    |
 ### How It Works
@@ -125,19 +125,23 @@ npx browser-use --mcp
 ## 🔌 LLM Providers
-| Provider          | Import                       | Vision | Notes                                         |
-| ----------------- | ---------------------------- | ------ | --------------------------------------------- |
-| **OpenAI**        | `browser-use/llm/openai`     | ✅     | Default provider, reasoning models (o1/o3/o4) |
-| **Anthropic**     | `browser-use/llm/anthropic`  | ✅     | Prompt caching support                        |
-| **Google Gemini** | `browser-use/llm/google`     | ✅     | Extended thinking support                     |
-| **Azure OpenAI**  | `browser-use/llm/azure`      | ✅     | Enterprise deployment                         |
-| **AWS Bedrock**   | `browser-use/llm/aws`        | ✅     | Claude via AWS                                |
-| **Groq**          | `browser-use/llm/groq`       | ❌     | Fastest inference                             |
-| **Ollama**        | `browser-use/llm/ollama`     | ❌     | Local/self-hosted models                      |
-| **DeepSeek**      | `browser-use/llm/deepseek`   | ❌     | Cost-effective                                |
-| **OpenRouter**    | `browser-use/llm/openrouter` | Varies | Multi-model routing                           |
-| **Mistral**       | `browser-use/llm/mistral`    | Varies | Mistral models                                |
-| **Cerebras**      | `browser-use/llm/cerebras`   | ❌     | Fast inference                                |
+| Provider          | Import                        | Vision | Notes                                         |
+| ----------------- | ----------------------------- | ------ | --------------------------------------------- |
+| **OpenAI**        | `browser-use/llm/openai`      | ✅     | Default provider, reasoning models (o1/o3/o4) |
+| **Anthropic**     | `browser-use/llm/anthropic`   | ✅     | Prompt caching support                        |
+| **Google Gemini** | `browser-use/llm/google`      | ✅     | Extended thinking support                     |
+| **Azure OpenAI**  | `browser-use/llm/azure`       | ✅     | Enterprise deployment                         |
+| **AWS Bedrock**   | `browser-use/llm/aws`         | ✅     | Claude via AWS                                |
+| **Groq**          | `browser-use/llm/groq`        | ❌     | Fastest inference                             |
+| **Ollama**        | `browser-use/llm/ollama`      | ❌     | Local/self-hosted models                      |
+| **DeepSeek**      | `browser-use/llm/deepseek`    | ❌     | Cost-effective                                |
+| **OpenRouter**    | `browser-use/llm/openrouter`  | Varies | Multi-model routing                           |
+| **Mistral**       | `browser-use/llm/mistral`     | Varies | Mistral models                                |
+| **Cerebras**      | `browser-use/llm/cerebras`    | ❌     | Fast inference                                |
+| **Browser Use**   | `browser-use/llm/browser-use` | Varies | Hosted Browser Use LLM                        |
+| **LiteLLM**       | `browser-use/llm/litellm`     | Varies | OpenAI-compatible LiteLLM gateway             |
+| **OCI Raw**       | `browser-use/llm/oci-raw`     | Varies | Oracle Cloud Generative AI                    |
+| **Vercel**        | `browser-use/llm/vercel`      | Varies | Vercel AI Gateway / routed models             |
 <details>
 <summary>Provider examples</summary>
@@ -210,6 +214,7 @@ const agent = new Agent({
 ### Custom Actions
 ```typescript
+import fs from 'node:fs';
 import { Controller, ActionResult } from 'browser-use';
 import { z } from 'zod';
@@ -345,7 +350,7 @@ Add to your Claude Desktop config (`~/Library/Application Support/Claude/claude_
 }
 ```
-Available MCP tools: `browser_run_task`, `browser_navigate`, `browser_click`, `browser_type`, `browser_scroll`, `browser_get_state`, `browser_extract`, `browser_screenshot`, `browser_close`.
+Core MCP tools include `retry_with_browser_use_agent`, `browser_navigate`, `browser_click`, `browser_type`, `browser_get_state`, `browser_extract_content`, `browser_scroll`, `browser_go_back`, `browser_list_tabs`, `browser_switch_tab`, `browser_close_tab`, `browser_list_sessions`, `browser_close_session`, and `browser_close_all`. The server also exposes registered controller actions as additional MCP tools.
 > See [MCP Server Guide](./docs/MCP_SERVER.md) for more details.
@@ -423,4 +428,4 @@ pnpm exec tsx examples/simple-search.ts
 ## 📄 License
-[MIT](./LICENSE) © Web LLM
+[MIT](./LICENSE)

package/dist/agent/service.js CHANGED Viewed

@@ -34,7 +34,7 @@ import { AgentTelemetryEvent } from '../telemetry/views.js';
 import { TokenCost } from '../tokens/service.js';
 import { construct_judge_messages, construct_simple_judge_messages, } from './judge.js';
 import { CloudSkillService, MissingCookieException, build_skill_parameters_schema, get_skill_slug, } from '../skills/index.js';
-loadEnv();
+loadEnv({ quiet: true });
 const logger = createLogger('browser_use.agent');
 const URL_PATTERN = /https?:\/\/[^\s<>"']+|www\.[^\s<>"']+|[^\s<>"']+\.[a-z]{2,}(?:\/[^\s<>"']*)?/gi;
 export const log_response = (response, registry, logInstance = logger) => {
@@ -3886,7 +3886,16 @@ export class Agent {
                 {});
             const paramsResult = actionInfo.paramSchema.safeParse(rawParams);
             if (!paramsResult.success) {
-                throw new Error(`Invalid parameters for action '${requestedActionName}': ${paramsResult.error.message}`);
+                // Surface a human-readable issue list (zod v4 `prettifyError`) plus
+                // a corrective hint, rather than the default JSON dump of `.issues`.
+                // This Error propagates → `_handle_step_error` writes it into
+                // `state.last_result` → `create_state_messages` injects it into the
+                // next LLM turn, so the model knows exactly what shape it got wrong.
+                const pretty = z.prettifyError(paramsResult.error);
+                const sentParams = JSON.stringify(rawParams);
+                throw new Error(`Schema validation failed for action '${requestedActionName}'. ` +
+                    `You sent: ${sentParams}. Issues:\n${pretty}\n` +
+                    `Please retry with parameters matching the action's schema exactly.`);
             }
             normalizedActions.push(new modelForStep({
                 [actionName]: paramsResult.data,

package/dist/cli.js CHANGED Viewed

@@ -33,7 +33,7 @@ import { setupLogging } from './logging-config.js';
 import { get_tunnel_manager } from './skill-cli/tunnel.js';
 import { DeviceAuthClient, save_cloud_api_token } from './sync/auth.js';
 import dotenv from 'dotenv';
-dotenv.config();
+dotenv.config({ quiet: true });
 const require = createRequire(import.meta.url);
 const CLI_PROVIDER_ALIASES = {
     openai: 'openai',

package/dist/config.js CHANGED Viewed

@@ -4,7 +4,7 @@ import path from 'node:path';
 import { randomUUID } from 'node:crypto';
 import { config as loadEnv } from 'dotenv';
 import { createLogger } from './logging-config.js';
-loadEnv();
+loadEnv({ quiet: true });
 const logger = createLogger('browser_use.config');
 const expand_user = (value) => value.replace(/^~(?=$|\/|\\)/, os.homedir());
 const resolve_path = (value) => path.resolve(expand_user(value));

package/dist/controller/registry/views.d.ts CHANGED Viewed

@@ -4,6 +4,7 @@ export type ActionHandler = (...args: any[]) => Promise<unknown> | unknown;
 type BrowserSession = unknown;
 type BaseChatModel = unknown;
 type FileSystem = unknown;
+export declare function renderParamsJsonSchema(schema: ZodTypeAny, skipKeys: Set<string>): Record<string, unknown>;
 export declare class RegisteredAction {
     readonly name: string;
     readonly description: string;
@@ -13,6 +14,7 @@ export declare class RegisteredAction {
     readonly pageFilter: ((page: Page) => boolean) | null;
     readonly terminates_sequence: boolean;
     constructor(name: string, description: string, handler: ActionHandler, paramSchema: ZodTypeAny, domains?: string[] | null, pageFilter?: ((page: Page) => boolean) | null, terminates_sequence?: boolean);
+    getPromptJsonSchema(): Record<string, unknown>;
     promptDescription(): string;
 }
 export declare class ActionModel {

package/dist/controller/registry/views.js CHANGED Viewed

@@ -15,6 +15,38 @@ const getPageUrl = (page) => {
     }
     return candidate ?? '';
 };
+// Render an action's param schema as compact JSON Schema for the LLM prompt.
+// Replaces a prior raw dump of zod's private `_def` AST, which leaked
+// internal keys like `innerType`/`defaultValue` and confused the LLM into
+// copying default booleans into numeric fields (see scroll.num_pages bug).
+export function renderParamsJsonSchema(schema, skipKeys) {
+    // `io: 'input'` makes zod render the *input* shape (what the LLM is
+    // expected to provide). Without it, fields with `.default(...)` get marked
+    // as required in the JSON Schema (because the parsed *output* always has
+    // them), which misleads the model — e.g. scroll.num_pages, done.success.
+    const raw = z.toJSONSchema(schema, {
+        io: 'input',
+        unrepresentable: 'any',
+    });
+    // Strip dialect noise the LLM doesn't need.
+    delete raw.$schema;
+    const properties = raw.properties ?? {};
+    const filteredProps = {};
+    for (const [key, value] of Object.entries(properties)) {
+        if (skipKeys.has(key)) {
+            continue;
+        }
+        filteredProps[key] = value;
+    }
+    raw.properties = filteredProps;
+    if (Array.isArray(raw.required)) {
+        raw.required = raw.required.filter((key) => typeof key === 'string' && !skipKeys.has(key));
+        if (raw.required.length === 0) {
+            delete raw.required;
+        }
+    }
+    return raw;
+}
 export class RegisteredAction {
     name;
     description;
@@ -32,10 +64,12 @@ export class RegisteredAction {
         this.pageFilter = pageFilter;
         this.terminates_sequence = terminates_sequence;
     }
-    promptDescription() {
+    // Returns the JSON Schema rendered for the LLM prompt, with the same
+    // skipKeys logic applied as in `promptDescription`. Exposed so tooling
+    // (e.g. scripts/dump-schema.ts) can exercise the exact code path the
+    // model sees.
+    getPromptJsonSchema() {
         const skipKeys = new Set(['title']);
-        let description = `${this.description}: \n`;
-        description += `{${this.name}: `;
         const schemaShape = (this.paramSchema instanceof z.ZodObject && this.paramSchema.shape) ||
             ('shape' in this.paramSchema ? this.paramSchema.shape : null);
         const hideStructuredDoneSuccess = Boolean(this.name === 'done' &&
@@ -46,26 +80,19 @@ export class RegisteredAction {
         if (hideStructuredDoneSuccess) {
             skipKeys.add('success');
         }
-        const hideExtractOutputSchema = Boolean(this.name === 'extract_structured_data' &&
+        const hideExtractOutputSchema = Boolean((this.name === 'extract_structured_data' || this.name === 'extract') &&
             schemaShape &&
             typeof schemaShape === 'object' &&
             Object.prototype.hasOwnProperty.call(schemaShape, 'output_schema'));
         if (hideExtractOutputSchema) {
             skipKeys.add('output_schema');
         }
-        if (schemaShape) {
-            const props = Object.fromEntries(Object.entries(schemaShape)
-                .filter(([key]) => !skipKeys.has(key))
-                .map(([key, value]) => {
-                const entries = value instanceof z.ZodType ? value._def : value;
-                const cleanEntries = Object.fromEntries(Object.entries(entries).filter(([propKey]) => !skipKeys.has(propKey)));
-                return [key, cleanEntries];
-            }));
-            description += JSON.stringify(props);
-        }
-        else {
-            description += '{}';
-        }
+        return renderParamsJsonSchema(this.paramSchema, skipKeys);
+    }
+    promptDescription() {
+        let description = `${this.description}: \n`;
+        description += `{${this.name}: `;
+        description += JSON.stringify(this.getPromptJsonSchema());
         description += '}';
         return description;
     }

package/dist/observability.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { createRequire } from 'node:module';
 import { config as loadEnv } from 'dotenv';
 import { createLogger } from './logging-config.js';
-loadEnv();
+loadEnv({ quiet: true });
 const require = createRequire(import.meta.url);
 const logger = createLogger('browser_use.observability');
 let lmnrObserve = null;

package/dist/utils.js CHANGED Viewed

@@ -10,7 +10,7 @@ import { fileURLToPath } from 'node:url';
 import { config as loadEnv } from 'dotenv';
 import * as minimatchModule from 'minimatch';
 import { createLogger } from './logging-config.js';
-loadEnv();
+loadEnv({ quiet: true });
 const logger = createLogger('browser_use.utils');
 let _exiting = false;
 const minimatch = (minimatchModule.minimatch ??

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "browser-use",
-  "version": "0.6.0",
+  "version": "0.6.1",
   "description": "A TypeScript-first library for programmatic browser control, designed for building AI-powered web agents.",
   "main": "./dist/index.js",
   "module": "./dist/index.js",
@@ -264,6 +264,11 @@
   "engines": {
     "node": ">=18.0.0"
   },
+  "config": {
+    "commitizen": {
+      "path": "cz-conventional-changelog"
+    }
+  },
   "keywords": [
     "browser",
     "use",
@@ -285,7 +290,7 @@
     "@google/genai": "^1.40.0",
     "@modelcontextprotocol/sdk": "^1.27.1",
     "adm-zip": "^0.5.16",
-    "axios": "^1.13.4",
+    "axios": "^1.16.0",
     "canvas": "^3.2.1",
     "dotenv": "^17.2.4",
     "eventemitter3": "^5.0.4",
@@ -318,6 +323,7 @@
     "@typescript-eslint/parser": "^8.54.0",
     "@vitest/coverage-v8": "^4.0.18",
     "commitizen": "^4.3.1",
+    "cz-conventional-changelog": "^3.3.0",
     "eslint": "^9.39.2",
     "eslint-config-prettier": "^10.1.8",
     "eslint-plugin-import": "^2.32.0",
@@ -326,14 +332,9 @@
     "prettier": "^3.8.1",
     "tsx": "^4.21.0",
     "typescript": "^5.9.3",
-    "vite": "^7.3.1",
+    "vite": "^7.3.2",
     "vitest": "^4.0.18"
   },
-  "config": {
-    "commitizen": {
-      "path": "cz-conventional-changelog"
-    }
-  },
   "scripts": {
     "build": "node scripts/clean-dist.mjs && tsc && node scripts/copy-dom-tree.mjs",
     "build:watch": "tsc --watch --preserveWatchOutput",
@@ -349,12 +350,12 @@
     "test:watch": "vitest --watch",
     "test:pack": "node scripts/smoke-pack.mjs",
     "check": "pnpm lint && pnpm typecheck && pnpm typecheck:test && pnpm test:unit && pnpm test:integration && pnpm test:e2e && pnpm test:pack",
+    "commit": "cz",
     "typecheck": "tsc --noEmit",
     "typecheck:test": "tsc -p tsconfig.test.json --noEmit",
     "format": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\"",
     "format:check": "prettier --check \"src/**/*.ts\" \"test/**/*.ts\"",
     "prettier": "prettier --write \"src/**/*.ts\" \"test/**/*.ts\"",
-    "commit": "pnpm exec git-cz",
     "postinstall": "playwright install chromium",
     "postinstall:ci": "playwright install --with-deps chromium"
   }