@akshayram1/omnibrowser-agent 0.2.29 → 0.2.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.github.md +293 -0
- package/README.md +94 -167
- package/README.npm.md +220 -0
- package/package.json +4 -2
- package/.github/workflows/ci.yml +0 -41
- package/docs/ARCHITECTURE.md +0 -64
- package/docs/DEPLOYMENT.md +0 -67
- package/docs/EMBEDDING.md +0 -74
- package/docs/ROADMAP.md +0 -29
- package/docs/arch.md +0 -220
- package/index.html +0 -1448
- package/plan.md +0 -114
- package/styles.css +0 -845
- package/vercel.json +0 -11
package/README.github.md
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# omnibrowser-agent
|
|
2
|
+
|
|
3
|
+
[npm package](https://www.npmjs.com/package/@akshayram1/omnibrowser-agent)
|
|
4
|
+
[License: MIT](LICENSE)
|
|
5
|
+
|
|
6
|
+
Local-first browser AI operator. Plans and executes DOM actions entirely in the browser — no API keys, no cloud costs, no data leaving your machine.
|
|
7
|
+
|
|
8
|
+
[Live Demo](https://omnibrowser-agent.vercel.app/examples/chatbot/) · [Embedding Guide](docs/EMBEDDING.md) · [Architecture](docs/arch.md) · [Deployment](docs/DEPLOYMENT.md) · [Roadmap](docs/ROADMAP.md)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Architecture
|
|
13
|
+
|
|
14
|
+
```mermaid
|
|
15
|
+
flowchart TB
|
|
16
|
+
subgraph DELIVERY["Delivery Layer"]
|
|
17
|
+
EXT["🧩 Chrome Extension\npopup + background worker"]
|
|
18
|
+
LIB["📦 npm Library\ncreateBrowserAgent()"]
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
subgraph ORCHESTRATION["Orchestration"]
|
|
22
|
+
BG["background/index.ts\nSession & tick loop"]
|
|
23
|
+
BA["BrowserAgent class\nrunLoop() / resume() / stop()"]
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
subgraph CORE["Core (src/core/)"]
|
|
27
|
+
PL["planner.ts\nheuristicPlan() / webllm bridge\nplanNextAction()"]
|
|
28
|
+
OB["observer.ts\ncollectSnapshot()\nDOM candidates + visibility filter"]
|
|
29
|
+
EX["executor.ts\nexecuteAction()\nclick / type / navigate\nscroll / focus / wait"]
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
subgraph SHARED["Shared (src/shared/)"]
|
|
33
|
+
CT["contracts.ts\nAgentAction · PageSnapshot\nAgentSession · PlannerResult"]
|
|
34
|
+
SF["safety.ts\nassessRisk()\nsafe / review / blocked"]
|
|
35
|
+
PA["parse-action.ts\nparseAction()\nparsePlannerResult()"]
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
subgraph OUTCOMES["Action Outcomes"]
|
|
39
|
+
direction LR
|
|
40
|
+
OK["✅ safe → execute"]
|
|
41
|
+
RV["⚠️ review → needs approval"]
|
|
42
|
+
BL["🚫 blocked → stop"]
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
subgraph PLANNERS["Planner Modes"]
|
|
46
|
+
direction LR
|
|
47
|
+
HP["Heuristic\nzero deps · offline\nregex patterns"]
|
|
48
|
+
WL["WebLLM\non-device · WebGPU\nwindow.__browserAgentWebLLM"]
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
EXT --> BG
|
|
52
|
+
LIB --> BA
|
|
53
|
+
BG -. "chrome.tabs.sendMessage" .-> CORE
|
|
54
|
+
BA --> CORE
|
|
55
|
+
|
|
56
|
+
PL --> OB
|
|
57
|
+
PL --> SHARED
|
|
58
|
+
OB --> SHARED
|
|
59
|
+
EX --> SHARED
|
|
60
|
+
|
|
61
|
+
SF --> OUTCOMES
|
|
62
|
+
PL --> PLANNERS
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## How it works — one tick
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
goal + history + memory
|
|
71
|
+
│
|
|
72
|
+
▼
|
|
73
|
+
observer.collectSnapshot() ──→ PageSnapshot (url, title, candidates[])
|
|
74
|
+
│
|
|
75
|
+
▼
|
|
76
|
+
planner.planNextAction() ──→ PlannerResult { action, evaluation?, memory?, nextGoal? }
|
|
77
|
+
│
|
|
78
|
+
▼
|
|
79
|
+
safety.assessRisk(action) ──→ safe | review | blocked
|
|
80
|
+
│
|
|
81
|
+
┌────┴─────────────────────┐
|
|
82
|
+
blocked review (human-approved mode)
|
|
83
|
+
│ │
|
|
84
|
+
stop pause → user approves → resume
|
|
85
|
+
│
|
|
86
|
+
safe / approved
|
|
87
|
+
│
|
|
88
|
+
▼
|
|
89
|
+
executor.executeAction(action) ──→ result string
|
|
90
|
+
│
|
|
91
|
+
▼
|
|
92
|
+
session.history.push(result)
|
|
93
|
+
→ next tick
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
The planner runs a **reflection loop** before each action: it evaluates what happened in the previous step, maintains working memory across steps, and states its next goal — giving the agent much better multi-step reasoning.
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Install
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
npm install @akshayram1/omnibrowser-agent
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Quick start
|
|
109
|
+
|
|
110
|
+
```ts
|
|
111
|
+
import { createBrowserAgent } from "@akshayram1/omnibrowser-agent";
|
|
112
|
+
|
|
113
|
+
const agent = createBrowserAgent({
|
|
114
|
+
goal: "Search for contact Jane Doe and open her profile",
|
|
115
|
+
mode: "human-approved", // or "autonomous"
|
|
116
|
+
planner: { kind: "heuristic" } // or "webllm"
|
|
117
|
+
}, {
|
|
118
|
+
onStep: (result, session) => console.log(result.message),
|
|
119
|
+
onApprovalRequired: (action, session) => console.log("Review:", action),
|
|
120
|
+
onDone: (result, session) => console.log("Done:", result.message),
|
|
121
|
+
onError: (err, session) => console.error(err),
|
|
122
|
+
onMaxStepsReached: (session) => console.log("Max steps hit"),
|
|
123
|
+
});
|
|
124
|
+
|
|
125
|
+
await agent.start();
|
|
126
|
+
|
|
127
|
+
// After onApprovalRequired fires:
|
|
128
|
+
await agent.resume();
|
|
129
|
+
|
|
130
|
+
// Cancel at any time:
|
|
131
|
+
agent.stop();
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
## Planner modes
|
|
137
|
+
|
|
138
|
+
| Mode | Description | When to use |
|
|
139
|
+
|---|---|---|
|
|
140
|
+
| `heuristic` | Zero-dependency regex planner. Works fully offline. | Simple, predictable goals — navigate, fill, click |
|
|
141
|
+
| `webllm` | On-device LLM via WebGPU. Fully private, no API calls. | Open-ended, multi-step, language-heavy goals |
|
|
142
|
+
|
|
143
|
+
### WebLLM with a custom system prompt
|
|
144
|
+
|
|
145
|
+
```ts
|
|
146
|
+
const agent = createBrowserAgent({
|
|
147
|
+
goal: "Fill the checkout form",
|
|
148
|
+
planner: {
|
|
149
|
+
kind: "webllm",
|
|
150
|
+
systemPrompt: "You are a careful checkout assistant. Never submit before all required fields are filled."
|
|
151
|
+
}
|
|
152
|
+
});
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
See [docs/EMBEDDING.md](docs/EMBEDDING.md) for the full WebLLM bridge wiring guide.
|
|
156
|
+
|
|
157
|
+
### Recommended WebLLM models
|
|
158
|
+
|
|
159
|
+
- `Llama-3.2-1B-Instruct-q4f16_1-MLC` — fast, ~600 MB
|
|
160
|
+
- `Llama-3.2-3B-Instruct-q4f16_1-MLC` — better quality, ~1.5 GB
|
|
161
|
+
- `Phi-3.5-mini-instruct-q4f16_1-MLC` — strong quality, ~2 GB
|
|
162
|
+
- `Mistral-7B-Instruct-v0.3-q4f16_1-MLC` — balanced quality, ~4.1 GB
|
|
163
|
+
- `Qwen2.5-7B-Instruct-q4f16_1-MLC` — strongest quality, ~4.3 GB
|
|
164
|
+
- `Llama-3.1-8B-Instruct-q4f16_1-MLC` — strong reasoning, ~4.8 GB
|
|
165
|
+
|
|
166
|
+
Model availability can vary by WebLLM release/build; if one fails to load, use a smaller fallback like `Llama-3.2-1B-Instruct-q4f16_1-MLC`.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Agent modes
|
|
171
|
+
|
|
172
|
+
| Mode | Behaviour |
|
|
173
|
+
|---|---|
|
|
174
|
+
| `autonomous` | All `safe` and `review` actions execute without pause |
|
|
175
|
+
| `human-approved` | `review`-rated actions pause and emit `onApprovalRequired` — call `resume()` to continue |
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Supported actions
|
|
180
|
+
|
|
181
|
+
| Action | Description | Risk |
|
|
182
|
+
|---|---|---|
|
|
183
|
+
| `navigate` | Navigate to a URL (http/https only) | safe |
|
|
184
|
+
| `click` | Click an element by CSS selector | safe / review |
|
|
185
|
+
| `type` | Type text into an input or textarea | safe / review |
|
|
186
|
+
| `scroll` | Scroll a container or the page | safe |
|
|
187
|
+
| `focus` | Focus an element | safe |
|
|
188
|
+
| `wait` | Pause for N milliseconds | safe |
|
|
189
|
+
| `extract` | Extract text from an element | review |
|
|
190
|
+
| `done` | Signal task completion | safe |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## AbortSignal support
|
|
195
|
+
|
|
196
|
+
```ts
|
|
197
|
+
const controller = new AbortController();
|
|
198
|
+
const agent = createBrowserAgent({ goal: "...", signal: controller.signal });
|
|
199
|
+
agent.start();
|
|
200
|
+
|
|
201
|
+
controller.abort(); // cancel from outside
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Chrome Extension
|
|
207
|
+
|
|
208
|
+
1. Build:
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
npm run build
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
2. Open `chrome://extensions`, enable **Developer Mode**, click **Load unpacked**, select `dist/`.
|
|
215
|
+
|
|
216
|
+
3. Open any tab, enter a goal in the popup, pick a mode, and click **Start**.
|
|
217
|
+
|
|
218
|
+
See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for publishing and CI pipeline details.
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Project structure
|
|
223
|
+
|
|
224
|
+
```
|
|
225
|
+
src/
|
|
226
|
+
├── background/ Extension service worker — session management
|
|
227
|
+
├── content/ Extension content script — runs in page context
|
|
228
|
+
├── core/ Shared engine (planner, observer, executor)
|
|
229
|
+
│ ├── planner.ts
|
|
230
|
+
│ ├── observer.ts
|
|
231
|
+
│ └── executor.ts
|
|
232
|
+
├── lib/ npm library entry — BrowserAgent class
|
|
233
|
+
│ └── index.ts
|
|
234
|
+
├── popup/ Extension popup UI
|
|
235
|
+
└── shared/ Types, safety, and parse utilities
|
|
236
|
+
├── contracts.ts
|
|
237
|
+
├── safety.ts
|
|
238
|
+
└── parse-action.ts
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Changelog
|
|
244
|
+
|
|
245
|
+
### v0.2.6
|
|
246
|
+
|
|
247
|
+
- Reflection-before-action pattern (`evaluation → memory → next_goal → action`) — agent reasons about each step before acting
|
|
248
|
+
- Working memory carried across ticks for better multi-step goals
|
|
249
|
+
- `parsePlannerResult()` exported from the library
|
|
250
|
+
- `systemPrompt` option in `PlannerConfig` — pass your own prompt without rewriting the bridge
|
|
251
|
+
- Thought bubble (💭) messages in the live demo chat showing the agent's next intent
|
|
252
|
+
|
|
253
|
+
### v0.2.4 — v0.2.5
|
|
254
|
+
|
|
255
|
+
- CI pipeline: auto version bump on push to main
|
|
256
|
+
- Removed page-agent dependency — reflection pattern implemented natively
|
|
257
|
+
- Chatbot demo redesign: right-aligned user messages, typing indicator, tab navigation (CRM + Task Manager)
|
|
258
|
+
- `parsePlannerResult()` and `PlannerResult` type exported from library
|
|
259
|
+
|
|
260
|
+
### v0.2.2
|
|
261
|
+
|
|
262
|
+
- SDK/extension separation: core logic in `src/core/` shared between extension and npm library
|
|
263
|
+
- 22 unit tests across planner and safety modules
|
|
264
|
+
- Action verification in executor (disabled-check, value-verify, empty-check)
|
|
265
|
+
- `CandidateElement.label` from associated `<label>` elements
|
|
266
|
+
- Retry loop with `lastError` fed back to planner on failure
|
|
267
|
+
|
|
268
|
+
### v0.2.0
|
|
269
|
+
|
|
270
|
+
- New actions: `scroll` and `focus`
|
|
271
|
+
- Smarter safety: risk assessment checks element label/text
|
|
272
|
+
- Improved heuristic planner with regex pattern matching
|
|
273
|
+
- Better page observation: filters out invisible elements and collects up to 60 candidates
|
|
274
|
+
- Library API: `resume()`, `isRunning`, `hasPendingAction`, `onMaxStepsReached`, `AbortSignal`
|
|
275
|
+
|
|
276
|
+
### v0.1.0
|
|
277
|
+
|
|
278
|
+
- Extension runtime loop, shared action contracts, heuristic + WebLLM planner, human-approved mode
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Docs
|
|
283
|
+
|
|
284
|
+
- [Embedding Guide](docs/EMBEDDING.md) — integrate into any web app
|
|
285
|
+
- [Architecture](docs/arch.md) — layer-by-layer breakdown
|
|
286
|
+
- [Deployment](docs/DEPLOYMENT.md) — npm publish, Vercel, Chrome extension, CI
|
|
287
|
+
- [Roadmap](docs/ROADMAP.md) — planned features
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
## License
|
|
292
|
+
|
|
293
|
+
MIT © Akshay Chame
|
package/README.md
CHANGED
|
@@ -5,96 +5,55 @@
|
|
|
5
5
|
|
|
6
6
|
Local-first browser AI operator. Plans and executes DOM actions entirely in the browser — no API keys, no cloud costs, no data leaving your machine.
|
|
7
7
|
|
|
8
|
-
[Live Demo](https://omnibrowser-agent.vercel.app/examples/chatbot/) · [
|
|
8
|
+
[Live Demo](https://omnibrowser-agent.vercel.app/examples/chatbot/) · [GitHub](https://github.com/akshayram1/omnibrowser-agent) · [Embedding Guide](https://github.com/akshayram1/omnibrowser-agent/blob/main/docs/EMBEDDING.md) · [Roadmap](https://github.com/akshayram1/omnibrowser-agent/blob/main/docs/ROADMAP.md)
|
|
9
9
|
|
|
10
10
|
---
|
|
11
11
|
|
|
12
12
|
## Architecture
|
|
13
13
|
|
|
14
|
-
```
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
CT["contracts.ts\nAgentAction · PageSnapshot\nAgentSession · PlannerResult"]
|
|
34
|
-
SF["safety.ts\nassessRisk()\nsafe / review / blocked"]
|
|
35
|
-
PA["parse-action.ts\nparseAction()\nparsePlannerResult()"]
|
|
36
|
-
end
|
|
37
|
-
|
|
38
|
-
subgraph OUTCOMES["Action Outcomes"]
|
|
39
|
-
direction LR
|
|
40
|
-
OK["✅ safe → execute"]
|
|
41
|
-
RV["⚠️ review → needs approval"]
|
|
42
|
-
BL["🚫 blocked → stop"]
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
subgraph PLANNERS["Planner Modes"]
|
|
46
|
-
direction LR
|
|
47
|
-
HP["Heuristic\nzero deps · offline\nregex patterns"]
|
|
48
|
-
WL["WebLLM\non-device · WebGPU\nwindow.__browserAgentWebLLM"]
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
EXT --> BG
|
|
52
|
-
LIB --> BA
|
|
53
|
-
BG -. "chrome.tabs.sendMessage" .-> CORE
|
|
54
|
-
BA --> CORE
|
|
55
|
-
|
|
56
|
-
PL --> OB
|
|
57
|
-
PL --> SHARED
|
|
58
|
-
OB --> SHARED
|
|
59
|
-
EX --> SHARED
|
|
60
|
-
|
|
61
|
-
SF --> OUTCOMES
|
|
62
|
-
PL --> PLANNERS
|
|
14
|
+
```
|
|
15
|
+
Chrome Extension npm Library
|
|
16
|
+
(popup + bg worker) createBrowserAgent()
|
|
17
|
+
| |
|
|
18
|
+
+----------+-------------+
|
|
19
|
+
|
|
|
20
|
+
Orchestration
|
|
21
|
+
(session & tick loop)
|
|
22
|
+
|
|
|
23
|
+
+----------+----------+
|
|
24
|
+
| | |
|
|
25
|
+
observer planner executor
|
|
26
|
+
(DOM snap) (heuristic (click/type/
|
|
27
|
+
/webllm) navigate...)
|
|
28
|
+
| | |
|
|
29
|
+
+----------+----------+
|
|
30
|
+
|
|
|
31
|
+
safety
|
|
32
|
+
(safe/review/blocked)
|
|
63
33
|
```
|
|
64
34
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
## How it works — one tick
|
|
35
|
+
### One tick
|
|
68
36
|
|
|
69
37
|
```
|
|
70
38
|
goal + history + memory
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
observer.collectSnapshot()
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
planner.planNextAction()
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
safety.assessRisk(action)
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
│
|
|
88
|
-
▼
|
|
89
|
-
executor.executeAction(action) ──→ result string
|
|
90
|
-
│
|
|
91
|
-
▼
|
|
92
|
-
session.history.push(result)
|
|
93
|
-
→ next tick
|
|
39
|
+
|
|
|
40
|
+
v
|
|
41
|
+
observer.collectSnapshot() --> PageSnapshot (url, title, candidates[])
|
|
42
|
+
|
|
|
43
|
+
v
|
|
44
|
+
planner.planNextAction() --> PlannerResult { action, evaluation?, memory?, nextGoal? }
|
|
45
|
+
|
|
|
46
|
+
v
|
|
47
|
+
safety.assessRisk(action) --> safe | review | blocked
|
|
48
|
+
|
|
|
49
|
+
blocked --> stop
|
|
50
|
+
review --> pause (human-approved) --> user calls resume()
|
|
51
|
+
safe --> executor.executeAction()
|
|
52
|
+
|
|
|
53
|
+
v
|
|
54
|
+
session.history.push(result) --> next tick
|
|
94
55
|
```
|
|
95
56
|
|
|
96
|
-
The planner uses a **reflection loop** before each action: it evaluates what happened last step, maintains working memory across steps, and states its next goal — giving the agent much better multi-step reasoning.
|
|
97
|
-
|
|
98
57
|
---
|
|
99
58
|
|
|
100
59
|
## Install
|
|
@@ -135,10 +94,10 @@ agent.stop();
|
|
|
135
94
|
|
|
136
95
|
## Planner modes
|
|
137
96
|
|
|
138
|
-
| Mode
|
|
139
|
-
|
|
97
|
+
| Mode | Description | When to use |
|
|
98
|
+
|-------------|-----------------------------------------------------|-----------------------------------------------|
|
|
140
99
|
| `heuristic` | Zero-dependency regex planner. Works fully offline. | Simple, predictable goals — navigate, fill, click |
|
|
141
|
-
| `webllm`
|
|
100
|
+
| `webllm` | On-device LLM via WebGPU. Fully private, no API calls. | Open-ended, multi-step, language-heavy goals |
|
|
142
101
|
|
|
143
102
|
### WebLLM with a custom system prompt
|
|
144
103
|
|
|
@@ -152,42 +111,44 @@ const agent = createBrowserAgent({
|
|
|
152
111
|
});
|
|
153
112
|
```
|
|
154
113
|
|
|
155
|
-
See [docs/EMBEDDING.md](docs/EMBEDDING.md) for the full WebLLM bridge wiring guide.
|
|
156
|
-
|
|
157
114
|
### Recommended WebLLM models
|
|
158
115
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
116
|
+
| Model ID | Size | Notes |
|
|
117
|
+
|----------|------|-------|
|
|
118
|
+
| `Llama-3.2-1B-Instruct-q4f16_1-MLC` | ~600 MB | fastest |
|
|
119
|
+
| `Llama-3.2-3B-Instruct-q4f16_1-MLC` | ~1.5 GB | fast |
|
|
120
|
+
| `Phi-3.5-mini-instruct-q4f16_1-MLC` | ~2 GB | quality |
|
|
121
|
+
| `Mistral-7B-Instruct-v0.3-q4f16_1-MLC` | ~4.1 GB | balanced |
|
|
122
|
+
| `Qwen2.5-7B-Instruct-q4f16_1-MLC` | ~4.3 GB | strong |
|
|
123
|
+
| `Llama-3.1-8B-Instruct-q4f16_1-MLC` | ~4.8 GB | strong |
|
|
124
|
+
| `Qwen3-8B-q4f16_1-MLC` | ~5 GB | latest Qwen |
|
|
125
|
+
| `gemma-2-9b-it-q4f16_1-MLC` | ~5.5 GB | Google Gemma |
|
|
126
|
+
| `DeepSeek-R1-Distill-Llama-8B-q4f16_1-MLC` | ~5 GB | reasoning |
|
|
127
|
+
| `Llama-3.1-70B-Instruct-q3f16_1-MLC` | ~35 GB | most capable (needs 24+ GB VRAM) |
|
|
167
128
|
|
|
168
129
|
---
|
|
169
130
|
|
|
170
131
|
## Agent modes
|
|
171
132
|
|
|
172
|
-
| Mode
|
|
173
|
-
|
|
174
|
-
| `autonomous`
|
|
133
|
+
| Mode | Behaviour |
|
|
134
|
+
|------------------|---------------------------------------------------------------------------|
|
|
135
|
+
| `autonomous` | All `safe` and `review` actions execute without pause |
|
|
175
136
|
| `human-approved` | `review`-rated actions pause and emit `onApprovalRequired` — call `resume()` to continue |
|
|
176
137
|
|
|
177
138
|
---
|
|
178
139
|
|
|
179
140
|
## Supported actions
|
|
180
141
|
|
|
181
|
-
| Action
|
|
182
|
-
|
|
183
|
-
| `navigate` | Navigate to a URL (http/https only) | safe
|
|
184
|
-
| `click`
|
|
185
|
-
| `type`
|
|
186
|
-
| `scroll`
|
|
187
|
-
| `focus`
|
|
188
|
-
| `wait`
|
|
189
|
-
| `extract`
|
|
190
|
-
| `done`
|
|
142
|
+
| Action | Description | Risk |
|
|
143
|
+
|------------|------------------------------------|----------------|
|
|
144
|
+
| `navigate` | Navigate to a URL (http/https only) | safe |
|
|
145
|
+
| `click` | Click an element by CSS selector | safe / review |
|
|
146
|
+
| `type` | Type text into an input | safe / review |
|
|
147
|
+
| `scroll` | Scroll a container or the page | safe |
|
|
148
|
+
| `focus` | Focus an element | safe |
|
|
149
|
+
| `wait` | Pause for N milliseconds | safe |
|
|
150
|
+
| `extract` | Extract text from an element | review |
|
|
151
|
+
| `done` | Signal task completion | safe |
|
|
191
152
|
|
|
192
153
|
---
|
|
193
154
|
|
|
@@ -203,19 +164,40 @@ controller.abort(); // cancel from outside
|
|
|
203
164
|
|
|
204
165
|
---
|
|
205
166
|
|
|
206
|
-
##
|
|
167
|
+
## WebLLM bridge wiring
|
|
207
168
|
|
|
208
|
-
|
|
169
|
+
```ts
|
|
170
|
+
import * as webllm from "@mlc-ai/web-llm";
|
|
171
|
+
import { createBrowserAgent, parsePlannerResult } from "@akshayram1/omnibrowser-agent";
|
|
172
|
+
|
|
173
|
+
const engine = await webllm.CreateMLCEngine("Phi-3.5-mini-instruct-q4f16_1-MLC");
|
|
174
|
+
|
|
175
|
+
window.__browserAgentWebLLM = {
|
|
176
|
+
async plan(input) {
|
|
177
|
+
const { goal, history, lastError, memory, systemPrompt } = input;
|
|
178
|
+
const resp = await engine.chat.completions.create({
|
|
179
|
+
messages: [
|
|
180
|
+
{ role: "system", content: systemPrompt || "You are a browser automation agent. Output only JSON." },
|
|
181
|
+
{ role: "user", content: `Goal: "${goal}"\nHistory: ${history.slice(-4).join(" -> ")}${memory ? "\nMemory: " + memory : ""}${lastError ? "\nLast error: " + lastError : ""}` }
|
|
182
|
+
],
|
|
183
|
+
temperature: 0,
|
|
184
|
+
max_tokens: 200
|
|
185
|
+
});
|
|
186
|
+
return parsePlannerResult(resp.choices[0].message.content);
|
|
187
|
+
}
|
|
188
|
+
};
|
|
209
189
|
|
|
210
|
-
|
|
211
|
-
|
|
190
|
+
const agent = createBrowserAgent({ goal: "Fill the checkout form", planner: { kind: "webllm" } });
|
|
191
|
+
await agent.start();
|
|
212
192
|
```
|
|
213
193
|
|
|
214
|
-
|
|
194
|
+
---
|
|
215
195
|
|
|
216
|
-
|
|
196
|
+
## Chrome Extension
|
|
217
197
|
|
|
218
|
-
|
|
198
|
+
1. `npm run build`
|
|
199
|
+
2. Open `chrome://extensions`, enable **Developer Mode**, click **Load unpacked**, select `dist/`.
|
|
200
|
+
3. Open any tab, enter a goal in the popup, pick a mode, and click **Start**.
|
|
219
201
|
|
|
220
202
|
---
|
|
221
203
|
|
|
@@ -226,68 +208,13 @@ src/
|
|
|
226
208
|
├── background/ Extension service worker — session management
|
|
227
209
|
├── content/ Extension content script — runs in page context
|
|
228
210
|
├── core/ Shared engine (planner, observer, executor)
|
|
229
|
-
|
|
230
|
-
│ ├── observer.ts
|
|
231
|
-
│ └── executor.ts
|
|
232
|
-
├── lib/ npm library entry — BrowserAgent class
|
|
233
|
-
│ └── index.ts
|
|
211
|
+
├── lib/ npm library entry — createBrowserAgent()
|
|
234
212
|
├── popup/ Extension popup UI
|
|
235
213
|
└── shared/ Types, safety, and parse utilities
|
|
236
|
-
├── contracts.ts
|
|
237
|
-
├── safety.ts
|
|
238
|
-
└── parse-action.ts
|
|
239
214
|
```
|
|
240
215
|
|
|
241
216
|
---
|
|
242
217
|
|
|
243
|
-
## Changelog
|
|
244
|
-
|
|
245
|
-
### v0.2.6
|
|
246
|
-
|
|
247
|
-
- Reflection-before-action pattern (`evaluation → memory → next_goal → action`) — agent reasons about each step before acting
|
|
248
|
-
- Working memory carried across ticks for better multi-step goals
|
|
249
|
-
- `parsePlannerResult()` exported from the library
|
|
250
|
-
- `systemPrompt` option in `PlannerConfig` — pass your own prompt without rewriting the bridge
|
|
251
|
-
- Thought bubble (💭) messages in the live demo chat showing the agent's next intent
|
|
252
|
-
|
|
253
|
-
### v0.2.4 — v0.2.5
|
|
254
|
-
|
|
255
|
-
- CI pipeline: auto version bump on push to main
|
|
256
|
-
- Removed page-agent dependency — reflection pattern implemented natively
|
|
257
|
-
- Chatbot demo redesign: right-aligned user messages, typing indicator, tab navigation (CRM + Task Manager)
|
|
258
|
-
- `parsePlannerResult()` and `PlannerResult` type exported from library
|
|
259
|
-
|
|
260
|
-
### v0.2.2
|
|
261
|
-
|
|
262
|
-
- SDK/extension separation: core logic in `src/core/` shared between extension and npm library
|
|
263
|
-
- 22 unit tests across planner and safety modules
|
|
264
|
-
- Action verification in executor (disabled-check, value-verify, empty-check)
|
|
265
|
-
- `CandidateElement.label` from associated `<label>` elements
|
|
266
|
-
- Retry loop with `lastError` fed back to planner on failure
|
|
267
|
-
|
|
268
|
-
### v0.2.0
|
|
269
|
-
|
|
270
|
-
- New actions: `scroll` and `focus`
|
|
271
|
-
- Smarter safety: risk assessment checks element label/text
|
|
272
|
-
- Improved heuristic planner with regex pattern matching
|
|
273
|
-
- Better page observation: filters invisible elements, up to 60 candidates
|
|
274
|
-
- Library API: `resume()`, `isRunning`, `hasPendingAction`, `onMaxStepsReached`, `AbortSignal`
|
|
275
|
-
|
|
276
|
-
### v0.1.0
|
|
277
|
-
|
|
278
|
-
- Extension runtime loop, shared action contracts, heuristic + WebLLM planner, human-approved mode
|
|
279
|
-
|
|
280
|
-
---
|
|
281
|
-
|
|
282
|
-
## Docs
|
|
283
|
-
|
|
284
|
-
- [Embedding Guide](docs/EMBEDDING.md) — integrate into any web app
|
|
285
|
-
- [Architecture](docs/arch.md) — layer-by-layer breakdown
|
|
286
|
-
- [Deployment](docs/DEPLOYMENT.md) — npm publish, Vercel, Chrome extension, CI
|
|
287
|
-
- [Roadmap](docs/ROADMAP.md) — planned features
|
|
288
|
-
|
|
289
|
-
---
|
|
290
|
-
|
|
291
218
|
## License
|
|
292
219
|
|
|
293
220
|
MIT © Akshay Chame
|