npm - @akshayram1/omnibrowser-agent - Versions diffs - 0.2.29 → 0.3.0 - Mend

@akshayram1/omnibrowser-agent 0.2.29 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/.env +1 -0
package/README.github.md +330 -0
package/README.md +94 -167
package/README.npm.md +220 -0
package/demo1.gif +0 -0
package/dist/background.js +1 -1
package/dist/background.js.map +1 -1
package/dist/content.js +63 -15
package/dist/content.js.map +3 -3
package/dist/lib.js +103 -19
package/dist/lib.js.map +3 -3
package/dist/manifest.json +1 -1
package/dist/types/lib/index.d.ts +1 -0
package/dist/types/shared/contracts.d.ts +2 -0
package/dist/types/shared/parse-action.d.ts +1 -0
package/dist/types/shared/safety.d.ts +2 -2
package/icons/big.png +0 -0
package/icons/logo.png +0 -0
package/icons/logo_horizontal.png +0 -0
package/notebook/.env +1 -0
package/notebook/README.md +39 -0
package/notebook/custom_quantized_llm_colab copy.ipynb +7084 -0
package/notebook/data/omnibrowser_planner_train.jsonl +500 -0
package/package.json +4 -2
package/.github/workflows/ci.yml +0 -41
package/docs/ARCHITECTURE.md +0 -64
package/docs/DEPLOYMENT.md +0 -67
package/docs/EMBEDDING.md +0 -74
package/docs/ROADMAP.md +0 -29
package/docs/arch.md +0 -220
package/index.html +0 -1448
package/plan.md +0 -114
package/styles.css +0 -845
package/vercel.json +0 -11

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@akshayram1/omnibrowser-agent",
-  "version": "0.2.29",
+  "version": "0.3.0",
   "private": false,
   "type": "module",
   "main": "./dist/lib.js",
@@ -20,7 +20,9 @@
     "build:types": "tsc -p tsconfig.lib.json",
     "watch": "node scripts/build.mjs --watch",
     "typecheck": "tsc --noEmit && tsc -p tsconfig.test.json --noEmit",
-    "test": "node --experimental-strip-types --test src/shared/safety.test.ts src/core/planner.test.ts"
+    "test": "node --experimental-strip-types --test src/shared/safety.test.ts src/core/planner.test.ts",
+    "prepack": "cp README.md README.github.md && cp README.npm.md README.md",
+    "postpack": "mv README.github.md README.md"
   },
   "devDependencies": {
     "@types/chrome": "^0.0.322",

package/.github/workflows/ci.yml DELETED Viewed

@@ -1,41 +0,0 @@
-name: CI
-on:
-  push:
-    branches: [main]
-permissions:
-  contents: write
-jobs:
-  ci:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-      - uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          registry-url: 'https://registry.npmjs.org'
-      - name: Install dependencies
-        run: npm install
-      - name: Run tests
-        run: npm test
-      - name: Bump patch version
-        id: bump
-        run: |
-          NEW_VERSION=$(npm version patch --no-git-tag-version)
-          echo "version=$NEW_VERSION" >> "$GITHUB_OUTPUT"
-      - name: Commit version bump
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "github-actions[bot]@users.noreply.github.com"
-          git add package.json
-          git commit -m "chore: bump version to ${{ steps.bump.outputs.version }} [skip ci]"
-          git push

package/docs/ARCHITECTURE.md DELETED Viewed

@@ -1,64 +0,0 @@
-# OmniBrowser Agent Architecture (v0.2)
-## Goals
-- Local-first runtime in browser
-- Privacy-first defaults
-- Open-source composable planner/executor contracts
-- Human-approved mode for risky actions
-## Runtime Components
-1. Popup UI (`src/popup`)
-   - Starts/stops sessions
-   - Picks mode (`autonomous`, `human-approved`)
-   - Picks planner (`heuristic`, `webllm`)
-2. Background Service Worker (`src/background`)
-   - Session state machine per tab
-   - Tick loop orchestration
-   - Approval handling
-3. Content Agent (`src/content`)
-   - `observer`: page snapshot extraction
-   - `planner`: next-action decision (heuristic / WebLLM)
-   - `safety`: risk gating (`safe`, `review`, `blocked`)
-   - `executor`: DOM action execution
-## Contracts
-- Shared in `src/shared/contracts.ts`
-- Action protocol:
-  - click
-  - type
-  - navigate
-  - extract
-  - scroll
-  - focus
-  - wait
-  - done
-## Safety Model
-- Block invalid URL protocols
-- Review risky actions (submit/delete/pay-like selectors)
-- In `human-approved` mode, review-level actions require manual approval
-## Planner Bridges
-All planner bridges follow the same pattern: an object attached to `window` that implements a `plan()` method returning an `AgentAction`. The core library has zero runtime dependencies — bridge implementations are provided by the consumer.
-### WebLLM bridge
-```ts
-window.__browserAgentWebLLM = {
-  async plan(input, modelId) { /* call local WebLLM engine, return AgentAction */ }
-};
-```
-## Limitations (v0.2)
-- No persistent long-term memory yet
-- No task DSL/skills registry yet
-- Risk scoring is simple keyword heuristic
-- Selector healing is basic (attribute fallback + single-element shortcut)

package/docs/DEPLOYMENT.md DELETED Viewed

@@ -1,67 +0,0 @@
-# Deployment Guide
-## npm Package
-### Publish a new version manually
-```bash
-npm run build
-npm publish --access public
-```
-The CI pipeline auto-bumps the patch version on every push to `main`, so manual version bumps are only needed for minor/major releases:
-```bash
-npm version minor   # 0.2.x → 0.3.0
-npm version major   # 0.x.y → 1.0.0
-npm run build
-npm publish --access public
-```
-### Required secret
-Automatic publishing from the pipeline is not enabled by default. If you enable it, add `NPM_TOKEN` to your GitHub repository secrets.
----
-## Vercel (Static Site / Chatbot Demo)
-The homepage and chatbot demo are static files served from the repo root.
-1. Import the repository at [vercel.com/new](https://vercel.com/new).
-2. Vercel picks up `vercel.json` automatically — no extra configuration needed.
-3. Every push to `main` triggers a new deployment.
-`vercel.json` key settings:
-```json
-{
-  "buildCommand": null,
-  "outputDirectory": "."
-}
-```
----
-## Chrome Extension (local / sideload)
-1. Build the extension bundle:
-   ```bash
-   npm run build
-   ```
-2. Open `chrome://extensions` and enable **Developer mode**.
-3. Click **Load unpacked** and select the `dist/` folder.
-To update after code changes, rebuild and click the refresh icon on the extension card.
----
-## CI Pipeline
-The GitHub Actions workflow (`.github/workflows/ci.yml`) runs on every push to `main`:
-1. Installs dependencies.
-2. Runs `npm test` (Node built-in test runner, no extra deps).
-3. Bumps the patch version in `package.json` via `npm version patch`.
-4. Commits and pushes the version bump with `[skip ci]` to avoid a second run.
-The commit is made by `github-actions[bot]` using the built-in `GITHUB_TOKEN` — no extra secrets required.

package/docs/EMBEDDING.md DELETED Viewed

@@ -1,74 +0,0 @@
-# Embedding OmniBrowser Agent in Your Website
-You can keep the extension flow and also embed OmniBrowser Agent as a library in your own web app.
-## Install
-```bash
-npm install @akshayram1/omnibrowser-agent
-```
-## Basic usage
-```ts
-import { createBrowserAgent } from "@akshayram1/omnibrowser-agent";
-const agent = createBrowserAgent(
-  {
-    goal: "Search contact Jane Doe and open profile",
-    mode: "human-approved",
-    planner: { kind: "heuristic" },
-    maxSteps: 15,
-    stepDelayMs: 400
-  },
-  {
-    onStep: (result) => console.log("step", result),
-    onApprovalRequired: (action) => {
-      console.log("approval required", action);
-      // Show your own modal/button then call approvePendingAction()
-    },
-    onDone: (result) => console.log("done", result),
-    onError: (error) => console.error(error)
-  }
-);
-await agent.start();
-```
-## Approve a pending action
-```ts
-await agent.approvePendingAction();
-```
-## Stop a running session
-```ts
-agent.stop();
-```
-## WebLLM mode in an embedded app
-To use planner mode `webllm`, load the WebLLM engine and wire the bridge before starting the agent:
-```ts
-import * as webllm from "@mlc-ai/web-llm";
-import { createBrowserAgent, createWebLLMBridge } from "@akshayram1/omnibrowser-agent";
-const engine = await webllm.CreateMLCEngine("Llama-3.2-1B-Instruct-q4f16_1-MLC");
-window.__browserAgentWebLLM = createWebLLMBridge(engine);
-const agent = createBrowserAgent({
-  goal: "Fill the contact form",
-  planner: { kind: "webllm", modelId: "Llama-3.2-1B-Instruct-q4f16_1-MLC" }
-});
-await agent.start();
-```
-## Notes
-- For production, mount this inside an authenticated app shell and add your own permission checks.
-- `human-approved` mode is recommended for CRM/finance/admin actions.
-- Bring your own WebLLM engine instance, then wire `createWebLLMBridge(engine)` to `window.__browserAgentWebLLM`.

package/docs/ROADMAP.md DELETED Viewed

@@ -1,29 +0,0 @@
-# Roadmap
-## v0.1
-- Extension runtime loop
-- Shared action contracts
-- Heuristic + WebLLM planner switch
-- Human-approved mode
-## v0.2 (current)
-- New actions: `scroll`, `focus`
-- Improved heuristic planner with regex goal patterns
-- Better page observation (visibility filtering, placeholder capture)
-- Library API: `resume()`, `isRunning`, `hasPendingAction`, `AbortSignal`, `onMaxStepsReached`
-## v0.3
-- Expanded WebLLM model catalog (new 7B/8B options + compatibility matrix)
-- Improved model loading UX (recommended presets by speed/quality and device memory)
-- Enhanced default system prompts for safer, clearer multi-step planning
-- Prompt presets for common workflows (docs navigation, CRM form fill, task automation)
-## v1.0
-- Advanced prompt orchestration (goal-aware system prompt routing and contextual guardrails)
-- Functionality expansion: richer action toolkit and stronger extraction/navigation reliability
-- Adaptive planner behaviour (model-aware retries, fallback strategies, and recovery flows)
-- Evaluation suite for prompt and model quality across benchmark browser tasks

package/docs/arch.md DELETED Viewed

@@ -1,220 +0,0 @@
-# omnibrowser-agent — Architecture
-> Local-first browser AI operator. Runs entirely in the browser — no API keys, no cloud costs, no data leaving your machine.
----
-## Architecture Diagram
-```mermaid
-flowchart TB
-    subgraph DELIVERY["Delivery Layer"]
-        EXT["🧩 Chrome Extension\npopup + background worker"]
-        LIB["📦 npm Library\ncreateBrowserAgent()"]
-    end
-    subgraph ORCHESTRATION["Orchestration"]
-        BG["background/index.ts\nSession & tick loop"]
-        BA["BrowserAgent class\nrunLoop() / resume() / stop()"]
-    end
-    subgraph CORE["Core  (src/core/)"]
-        PL["planner.ts\nheuristicPlan() / webllm bridge\nplanNextAction()"]
-        OB["observer.ts\ncollectSnapshot()\nDOM candidates + visibility filter"]
-        EX["executor.ts\nexecuteAction()\nclick / type / navigate\nextract / scroll / focus / wait"]
-    end
-    subgraph SHARED["Shared  (src/shared/)"]
-        CT["contracts.ts\nAgentAction · PageSnapshot\nAgentSession · PlannerResult"]
-        SF["safety.ts\nassessRisk()\nsafe / review / blocked"]
-        PA["parse-action.ts\nparseAction()\nparsePlannerResult()"]
-    end
-    subgraph OUTCOMES["Action Outcomes"]
-        direction LR
-        OK["✅ safe → execute"]
-        RV["⚠️ review → needs approval"]
-        BL["🚫 blocked → stop"]
-    end
-    subgraph PLANNERS["Planner Modes"]
-        direction LR
-        HP["Heuristic\nzero deps · offline\nregex patterns"]
-        WL["WebLLM\non-device · WebGPU\nwindow.__browserAgentWebLLM"]
-    end
-    EXT --> BG
-    LIB --> BA
-    BG -. "chrome.tabs.sendMessage" .-> CORE
-    BA --> CORE
-    PL --> OB
-    PL --> SHARED
-    OB --> SHARED
-    EX --> SHARED
-    SF --> OUTCOMES
-    PL --> PLANNERS
-```
----
-## Layer-by-layer explanation
-### Delivery layer
-There are two ways to use omnibrowser-agent, and they share the same underlying engine.
-**Chrome extension** — Install by loading the `dist/` folder as an unpacked extension in Chrome. A popup UI lets you enter a goal, pick a mode, and click Start. The background service worker manages session state and orchestrates the tick loop across tabs.
-**npm library** — Embed agent logic directly into any web app. Import `createBrowserAgent()` from `@akshayram1/omnibrowser-agent`, pass a goal and config, and wire up event callbacks. No extension required.
----
-### Orchestration
-**`background/index.ts`** (extension path) maintains a `Map<tabId, AgentSession>` and drives each session forward by sending `AGENT_TICK` messages to the active tab's content script. It handles `START_AGENT`, `APPROVE_ACTION`, `STOP_AGENT`, and `GET_STATUS` messages from the popup.
-**`BrowserAgent` class** (library path) runs the same tick loop in-process. It exposes `start()`, `resume()`, `stop()`, `isRunning`, and `hasPendingAction`, along with a full event callback API (`onStep`, `onApprovalRequired`, `onDone`, `onError`, `onMaxStepsReached`). Supports `AbortSignal` for external cancellation.
----
-### Core  (`src/core/`)
-These three modules are **shared** between the extension content script and the library. Neither delivery path duplicates them.
-| Module | Responsibility |
-|---|---|
-| `planner.ts` | Decides the next action given a goal, page snapshot, and history |
-| `observer.ts` | Reads the live DOM and returns a structured `PageSnapshot` |
-| `executor.ts` | Performs DOM actions and returns a result string |
-**`observer.ts` — `collectSnapshot()`**
-Queries all interactive elements (`a`, `button`, `input`, `textarea`, `select`, `[role=button]`, `[contenteditable]`), filters out invisible ones (hidden, `display:none`, zero dimensions), and prioritises in-viewport elements. Resolves accessible labels via `aria-labelledby`, `aria-label`, `for/id`, and wrapping `<label>`. Generates stable CSS selectors preferring `name`, `placeholder`, and `aria-label` attributes over fragile `:nth-of-type()` indices. Caps at 60 candidates. Returns `url`, `title`, `textPreview`, and `candidates[]`.
-**`planner.ts` — `planNextAction()`**
-Two modes:
-- *Heuristic* — pure regex. Matches `go to <url>`, `search for <x>`, `fill "<text>" in <field>`, `click <target>` patterns against the goal string, then falls back to filling the first visible input or clicking the first visible button.
-- *WebLLM* — delegates to `window.__browserAgentWebLLM.plan()`. The bridge is external — you wire it in. Accepts both legacy `AgentAction` returns and the new `PlannerResult` (with `evaluation`, `memory`, `nextGoal` reflection fields).
-**`executor.ts` — `executeAction()`**
-Performs the action. Uses `InputEvent` with `bubbles: true` so React/Vue controlled inputs receive proper framework events. Verifies: element exists, is not disabled (for clicks), value updated (for type), extracted text is non-empty. Includes selector fallback: when a selector fails, tries to recover via tag+attribute matching or single-element shortcut before throwing. Throws on failure so the retry loop can feed `lastError` back to the planner.
----
-### Shared  (`src/shared/`)
-**`contracts.ts`** — All TypeScript interfaces and union types. The single source of truth for `AgentAction`, `PageSnapshot`, `AgentSession`, `PlannerResult`, `ContentResult`, and the library config/event types.
-**`safety.ts` — `assessRisk()`**
-Returns one of three risk levels for any action:
-| Level | Meaning | Examples |
-|---|---|---|
-| `safe` | Execute immediately | `navigate` to http/https, `click` neutral label, `scroll`, `wait`, `focus` |
-| `review` | Pause for human approval in `human-approved` mode | `extract`, `click`/`type` on labels matching delete/pay/submit/confirm/transfer |
-| `blocked` | Never execute | `navigate` to `javascript:`, `file:`, or malformed URLs |
-**`parse-action.ts`** — Handles LLM output that may be wrapped in markdown fences, embedded in prose, or using the full reflection format `{ evaluation, memory, next_goal, action }`. Gracefully returns a `done` action on any parse failure so the loop never crashes.
----
-### Planner modes
-| Mode | Description | When to use |
-|---|---|---|
-| `heuristic` | Zero-dependency regex-based planner. Works fully offline. | Simple, predictable goals — navigate, search, fill a field, click a button |
-| `webllm` | Delegates to a `window.__browserAgentWebLLM` bridge. Fully private, runs on-device via WebGPU. | Open-ended, multi-step, or language-heavy goals |
----
-### Agent modes
-| Mode | Behaviour |
-|---|---|
-| `autonomous` | All `safe` and `review` actions execute without pause |
-| `human-approved` | `review`-rated actions pause and emit `onApprovalRequired` — user must call `resume()` or click **Approve** in the popup |
----
-### Data flow (one tick)
-```
-goal + history
-      │
-      ▼
-observer.collectSnapshot()  ──→  PageSnapshot (url, title, candidates[])
-      │
-      ▼
-planner.planNextAction()    ──→  PlannerResult { action, evaluation?, memory?, nextGoal? }
-      │
-      ▼
-safety.assessRisk(action)   ──→  safe | review | blocked
-      │
-   ┌──┴──────────────────────┐
-blocked               review (human-approved mode)
-   │                         │
-stop                  pause → user approves → resume
-                             │
-                        safe / approved
-                             │
-                             ▼
-              executor.executeAction(action)  ──→  result string
-                             │
-                             ▼
-                    session.history.push(result)
-                    → next tick
-```
----
-## Project structure
-```
-src/
-├── background/      Extension service worker — session management
-├── content/         Extension content script — runs in page context
-├── core/            Shared engine (planner, observer, executor)
-│   ├── planner.ts
-│   ├── observer.ts
-│   └── executor.ts
-├── lib/             npm library entry — BrowserAgent class
-│   └── index.ts
-├── popup/           Extension popup UI
-│   ├── index.html
-│   └── index.ts
-└── shared/          Types, safety, and parse utilities
-    ├── contracts.ts
-    ├── safety.ts
-    └── parse-action.ts
-```
----
-## Quick reference
-```ts
-import { createBrowserAgent } from "@akshayram1/omnibrowser-agent";
-const agent = createBrowserAgent({
-  goal: "Search for contact John Smith in CRM",
-  mode: "human-approved",        // or "autonomous"
-  planner: { kind: "heuristic" } // or "webllm"
-}, {
-  onStep:            (result, session) => console.log(result.message),
-  onApprovalRequired:(action, session) => console.log("Review:", action),
-  onDone:            (result, session) => console.log("Done:", result.message),
-});
-await agent.start();
-// After onApprovalRequired fires:
-await agent.resume();
-// Cancel at any time:
-agent.stop();
-```
----
-*MIT © Akshay Chame — [github.com/akshayram1/omnibrowser-agent](https://github.com/akshayram1/omnibrowser-agent)*