@akshayram1/omnibrowser-agent 0.2.3 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +210 -135
- package/dist/content.js +26 -24
- package/dist/content.js.map +2 -2
- package/dist/lib.js +61 -39
- package/dist/lib.js.map +2 -2
- package/dist/manifest.json +1 -1
- package/dist/popup.html +0 -1
- package/dist/types/core/planner.d.ts +2 -2
- package/dist/types/lib/index.d.ts +2 -2
- package/dist/types/shared/contracts.d.ts +31 -3
- package/dist/types/shared/parse-action.d.ts +12 -8
- package/docs/ARCHITECTURE.md +6 -14
- package/docs/EMBEDDING.md +21 -42
- package/docs/ROADMAP.md +0 -1
- package/docs/arch.md +220 -0
- package/index.html +275 -204
- package/package.json +1 -1
- package/styles.css +5 -0
package/index.html
CHANGED
|
@@ -3,11 +3,8 @@
|
|
|
3
3
|
<head>
|
|
4
4
|
<meta charset="UTF-8" />
|
|
5
5
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
6
|
-
<title>OmniBrowser Agent</title>
|
|
7
|
-
<meta
|
|
8
|
-
name="description"
|
|
9
|
-
content="OmniBrowser Agent - local-first browser AI operator library."
|
|
10
|
-
/>
|
|
6
|
+
<title>OmniBrowser Agent — Local-first Browser AI</title>
|
|
7
|
+
<meta name="description" content="OmniBrowser Agent — local-first browser AI operator. No API keys. No cloud. Runs entirely in the browser via WebLLM + WebGPU." />
|
|
11
8
|
<link rel="stylesheet" href="./styles.css" />
|
|
12
9
|
</head>
|
|
13
10
|
<body>
|
|
@@ -16,6 +13,7 @@
|
|
|
16
13
|
<a class="brand" href="#home">OmniBrowser Agent</a>
|
|
17
14
|
<nav class="nav">
|
|
18
15
|
<a href="#home">Home</a>
|
|
16
|
+
<a href="#whats-new">What's New</a>
|
|
19
17
|
<a href="#docs">Docs</a>
|
|
20
18
|
<a href="#architecture">Architecture</a>
|
|
21
19
|
<a href="#embedding">Embedding</a>
|
|
@@ -29,37 +27,29 @@
|
|
|
29
27
|
<!-- HOME -->
|
|
30
28
|
<section id="home" class="section hero">
|
|
31
29
|
<div class="wrap">
|
|
32
|
-
<p class="eyebrow">Open-source browser automation SDK</p>
|
|
30
|
+
<p class="eyebrow">Open-source browser automation SDK · v0.2.6</p>
|
|
33
31
|
<h1>Local-first browser AI automation library</h1>
|
|
34
32
|
<p>
|
|
35
|
-
OmniBrowser Agent
|
|
33
|
+
OmniBrowser Agent plans and executes DOM actions entirely in the browser — no API keys, no cloud costs, no data leaving your machine.
|
|
34
|
+
Wire in a WebLLM model and it reasons, remembers, and acts on any webpage.
|
|
36
35
|
</p>
|
|
37
36
|
<div class="chips">
|
|
38
37
|
<span>Privacy-first</span>
|
|
39
|
-
<span>WebLLM
|
|
38
|
+
<span>WebLLM + WebGPU</span>
|
|
39
|
+
<span>Reflection loop</span>
|
|
40
40
|
<span>Human-approved mode</span>
|
|
41
|
+
<span>Custom system prompt</span>
|
|
41
42
|
<span>Embeddable API</span>
|
|
42
43
|
</div>
|
|
43
44
|
<div class="actions">
|
|
44
45
|
<a class="btn primary" href="./examples/chatbot/">Live Demo</a>
|
|
45
|
-
<a
|
|
46
|
-
|
|
47
|
-
href="https://www.npmjs.com/package/@akshayram1/omnibrowser-agent"
|
|
48
|
-
target="_blank"
|
|
49
|
-
rel="noreferrer"
|
|
50
|
-
>NPM Package</a
|
|
51
|
-
>
|
|
52
|
-
<a
|
|
53
|
-
class="btn"
|
|
54
|
-
href="https://github.com/akshayram1/omnibrowser-agent"
|
|
55
|
-
target="_blank"
|
|
56
|
-
rel="noreferrer"
|
|
57
|
-
>GitHub Repo</a
|
|
58
|
-
>
|
|
46
|
+
<a class="btn" href="https://www.npmjs.com/package/@akshayram1/omnibrowser-agent" target="_blank" rel="noreferrer">NPM Package</a>
|
|
47
|
+
<a class="btn" href="https://github.com/akshayram1/omnibrowser-agent" target="_blank" rel="noreferrer">GitHub</a>
|
|
59
48
|
</div>
|
|
60
49
|
<div class="stats" aria-label="project stats">
|
|
61
|
-
<div class="stat"><strong>2</strong><span>
|
|
62
|
-
<div class="stat"><strong>
|
|
50
|
+
<div class="stat"><strong>2</strong><span>Agent Modes</span></div>
|
|
51
|
+
<div class="stat"><strong>2</strong><span>Planner Modes</span></div>
|
|
52
|
+
<div class="stat"><strong>8</strong><span>Action Types</span></div>
|
|
63
53
|
<div class="stat"><strong>MIT</strong><span>License</span></div>
|
|
64
54
|
</div>
|
|
65
55
|
<div class="home-grid">
|
|
@@ -67,16 +57,18 @@
|
|
|
67
57
|
<h3>Use Cases</h3>
|
|
68
58
|
<ul>
|
|
69
59
|
<li>CRM profile lookup automation</li>
|
|
70
|
-
<li>Guided
|
|
60
|
+
<li>Guided form-filling workflows</li>
|
|
71
61
|
<li>Assisted data extraction flows</li>
|
|
62
|
+
<li>Multi-step task automation</li>
|
|
72
63
|
</ul>
|
|
73
64
|
</article>
|
|
74
65
|
<article class="card">
|
|
75
|
-
<h3>Core
|
|
66
|
+
<h3>Core Engine</h3>
|
|
76
67
|
<ul>
|
|
77
|
-
<li><strong>Observer:</strong>
|
|
78
|
-
<li><strong>Planner:</strong> next
|
|
79
|
-
<li><strong>
|
|
68
|
+
<li><strong>Observer:</strong> DOM snapshot + candidate elements</li>
|
|
69
|
+
<li><strong>Planner:</strong> reflection → next action</li>
|
|
70
|
+
<li><strong>Safety:</strong> safe / review / blocked gating</li>
|
|
71
|
+
<li><strong>Executor:</strong> DOM actions with framework compat</li>
|
|
80
72
|
</ul>
|
|
81
73
|
</article>
|
|
82
74
|
<article class="card">
|
|
@@ -84,19 +76,89 @@
|
|
|
84
76
|
<ul>
|
|
85
77
|
<li><a href="https://www.npmjs.com/package/@akshayram1/omnibrowser-agent" target="_blank" rel="noreferrer">NPM package</a></li>
|
|
86
78
|
<li><a href="https://github.com/akshayram1/omnibrowser-agent" target="_blank" rel="noreferrer">GitHub repository</a></li>
|
|
87
|
-
<li><a href="./
|
|
79
|
+
<li><a href="./examples/chatbot/" target="_blank">Live Demo</a></li>
|
|
88
80
|
</ul>
|
|
89
81
|
</article>
|
|
90
82
|
</div>
|
|
91
83
|
</div>
|
|
92
84
|
</section>
|
|
93
85
|
|
|
86
|
+
<!-- WHAT'S NEW -->
|
|
87
|
+
<section id="whats-new" class="section">
|
|
88
|
+
<div class="wrap">
|
|
89
|
+
<div class="surface">
|
|
90
|
+
<h2>What's New in v0.2.6</h2>
|
|
91
|
+
<p>This release implements the <strong>reflection-before-action pattern</strong> — the same loop used by leading browser agents — plus a new <code>systemPrompt</code> option so you can shape agent behaviour without rewriting the bridge.</p>
|
|
92
|
+
|
|
93
|
+
<h3>Reflection Loop <span class="badge new">New</span></h3>
|
|
94
|
+
<p>Before every action the agent now goes through a 4-step inner loop:</p>
|
|
95
|
+
<div class="docs-grid">
|
|
96
|
+
<article class="doc-card">
|
|
97
|
+
<h4>1 · Evaluate</h4>
|
|
98
|
+
<p>What happened in the previous step? Did it succeed? What changed on the page?</p>
|
|
99
|
+
</article>
|
|
100
|
+
<article class="doc-card">
|
|
101
|
+
<h4>2 · Remember</h4>
|
|
102
|
+
<p>What key facts should be carried into the next step? Selector mappings, field values, task state.</p>
|
|
103
|
+
</article>
|
|
104
|
+
<article class="doc-card">
|
|
105
|
+
<h4>3 · Plan</h4>
|
|
106
|
+
<p>State the next goal in plain English before choosing an action.</p>
|
|
107
|
+
</article>
|
|
108
|
+
<article class="doc-card">
|
|
109
|
+
<h4>4 · Act</h4>
|
|
110
|
+
<p>Output the specific DOM action: click, type, navigate, scroll, etc.</p>
|
|
111
|
+
</article>
|
|
112
|
+
</div>
|
|
113
|
+
|
|
114
|
+
<p>The WebLLM bridge now returns the full reflection object:</p>
|
|
115
|
+
<pre><code>{
|
|
116
|
+
"evaluation": "The name field was filled successfully.",
|
|
117
|
+
"memory": "Name=#name done. Next: fill email at #email.",
|
|
118
|
+
"next_goal": "Type the email address into #email",
|
|
119
|
+
"action": { "type": "type", "selector": "#email", "text": "jane@example.com", "clearFirst": true }
|
|
120
|
+
}</code></pre>
|
|
121
|
+
|
|
122
|
+
<p>The <code>next_goal</code> field (exposed as <code>nextGoal</code> on the parsed <code>PlannerResult</code>) is surfaced in the live demo as a <strong>💭 thought bubble</strong> before each action, so you can follow the agent's reasoning in real time.</p>
|
|
123
|
+
|
|
124
|
+
<h3>Working Memory Across Steps <span class="badge new">New</span></h3>
|
|
125
|
+
<p>The agent's <code>memory</code> string is automatically carried forward from one tick to the next inside <code>AgentSession</code>. The planner receives it as <code>input.memory</code> and can update it each step — giving the agent a scratchpad across the whole task.</p>
|
|
126
|
+
|
|
127
|
+
<h3>Custom System Prompt <span class="badge new">New</span></h3>
|
|
128
|
+
<p>Pass your own system prompt directly in the planner config — no need to rewrite the bridge:</p>
|
|
129
|
+
<pre><code>const agent = createBrowserAgent({
|
|
130
|
+
goal: "Fill the checkout form",
|
|
131
|
+
planner: {
|
|
132
|
+
kind: "webllm",
|
|
133
|
+
systemPrompt: "You are a careful checkout assistant. Never submit before all required fields are filled."
|
|
134
|
+
}
|
|
135
|
+
});</code></pre>
|
|
136
|
+
|
|
137
|
+
<h3>New Exports <span class="badge new">New</span></h3>
|
|
138
|
+
<ul>
|
|
139
|
+
<li><code>parsePlannerResult(raw)</code> — parse the full reflection+action JSON from raw LLM output, with fallback to bare AgentAction for backward compatibility.</li>
|
|
140
|
+
<li><code>PlannerResult</code> type — <code>{ action, evaluation?, memory?, nextGoal? }</code></li>
|
|
141
|
+
</ul>
|
|
142
|
+
<pre><code>import { parsePlannerResult } from "@akshayram1/omnibrowser-agent";
|
|
143
|
+
|
|
144
|
+
const result = parsePlannerResult(llmRawOutput);
|
|
145
|
+
// result.action → AgentAction
|
|
146
|
+
// result.evaluation → string | undefined
|
|
147
|
+
// result.memory → string | undefined
|
|
148
|
+
// result.nextGoal → string | undefined</code></pre>
|
|
149
|
+
|
|
150
|
+
<h3>Backward Compatible</h3>
|
|
151
|
+
<p>Existing bridges that return a bare <code>AgentAction</code> object still work without any changes. The library normalises both formats automatically.</p>
|
|
152
|
+
</div>
|
|
153
|
+
</div>
|
|
154
|
+
</section>
|
|
155
|
+
|
|
94
156
|
<!-- DOCS / QUICK START -->
|
|
95
157
|
<section id="docs" class="section">
|
|
96
158
|
<div class="wrap">
|
|
97
159
|
<div class="surface">
|
|
98
160
|
<h2>Docs</h2>
|
|
99
|
-
<p>Everything you need to install,
|
|
161
|
+
<p>Everything you need to install, initialise, and run your first browser agent.</p>
|
|
100
162
|
|
|
101
163
|
<h3>Installation</h3>
|
|
102
164
|
<pre><code>npm install @akshayram1/omnibrowser-agent</code></pre>
|
|
@@ -107,86 +169,94 @@
|
|
|
107
169
|
const agent = createBrowserAgent(
|
|
108
170
|
{
|
|
109
171
|
goal: "Open CRM and find customer John Smith",
|
|
110
|
-
mode: "human-approved",
|
|
111
|
-
planner: { kind: "heuristic" }
|
|
172
|
+
mode: "human-approved", // or "autonomous"
|
|
173
|
+
planner: { kind: "heuristic" } // or "webllm"
|
|
112
174
|
},
|
|
113
175
|
{
|
|
114
|
-
onStep:
|
|
115
|
-
onApprovalRequired:
|
|
116
|
-
onDone:
|
|
117
|
-
|
|
176
|
+
onStep: (result, session) => console.log(result.message),
|
|
177
|
+
onApprovalRequired:(action, session) => console.log("Needs approval:", action),
|
|
178
|
+
onDone: (result, session) => console.log("Done:", result.message),
|
|
179
|
+
onError: (err, session) => console.error(err),
|
|
180
|
+
onMaxStepsReached: (session) => console.log("Max steps hit"),
|
|
118
181
|
}
|
|
119
182
|
);
|
|
120
183
|
|
|
121
184
|
await agent.start();
|
|
122
185
|
|
|
123
|
-
// Resume after approval:
|
|
186
|
+
// Resume after an approval prompt:
|
|
124
187
|
await agent.resume();
|
|
125
188
|
|
|
126
|
-
// Inspect state:
|
|
189
|
+
// Inspect state at any time:
|
|
127
190
|
console.log(agent.isRunning, agent.hasPendingAction);
|
|
128
191
|
|
|
129
192
|
// Stop:
|
|
130
193
|
agent.stop();</code></pre>
|
|
131
194
|
|
|
132
|
-
<h3>AbortSignal
|
|
195
|
+
<h3>AbortSignal Support</h3>
|
|
133
196
|
<pre><code>const controller = new AbortController();
|
|
134
197
|
const agent = createBrowserAgent({ goal: "...", signal: controller.signal });
|
|
135
198
|
agent.start();
|
|
136
199
|
|
|
137
|
-
//
|
|
138
|
-
controller.abort();</code></pre>
|
|
200
|
+
controller.abort(); // cancel from outside</code></pre>
|
|
139
201
|
|
|
140
|
-
<h3>
|
|
202
|
+
<h3>Reading Reflection Fields</h3>
|
|
203
|
+
<p>Every <code>onStep</code> result now includes optional reflection data from the planner:</p>
|
|
204
|
+
<pre><code>onStep(result, session) {
|
|
205
|
+
if (result.reflection?.nextGoal) {
|
|
206
|
+
console.log("Agent thinking:", result.reflection.nextGoal);
|
|
207
|
+
}
|
|
208
|
+
if (result.reflection?.memory) {
|
|
209
|
+
console.log("Agent memory:", result.reflection.memory);
|
|
210
|
+
}
|
|
211
|
+
console.log("Action:", result.message);
|
|
212
|
+
}</code></pre>
|
|
213
|
+
|
|
214
|
+
<h3>Agent Modes</h3>
|
|
141
215
|
<div class="docs-grid">
|
|
142
216
|
<article class="doc-card">
|
|
143
217
|
<h4>human-approved</h4>
|
|
144
|
-
<p>
|
|
218
|
+
<p>Pauses on review-rated actions and fires <code>onApprovalRequired</code>. Call <code>agent.resume()</code> to continue. Recommended for CRM, finance, and admin flows.</p>
|
|
145
219
|
</article>
|
|
146
220
|
<article class="doc-card">
|
|
147
221
|
<h4>autonomous</h4>
|
|
148
|
-
<p>
|
|
222
|
+
<p>Executes all safe and review actions without pausing. Best for rapid prototyping and demos.</p>
|
|
149
223
|
</article>
|
|
150
224
|
</div>
|
|
151
225
|
|
|
152
|
-
<h3>Planner
|
|
226
|
+
<h3>Planner Modes</h3>
|
|
153
227
|
<div class="docs-grid">
|
|
154
228
|
<article class="doc-card">
|
|
155
229
|
<h4>heuristic</h4>
|
|
156
|
-
<p>Zero-dependency regex
|
|
230
|
+
<p>Zero-dependency regex planner. Works fully offline. Best for simple, predictable goals: navigate, fill a field, click a button.</p>
|
|
157
231
|
</article>
|
|
158
232
|
<article class="doc-card">
|
|
159
233
|
<h4>webllm</h4>
|
|
160
|
-
<p>
|
|
161
|
-
</article>
|
|
162
|
-
<article class="doc-card">
|
|
163
|
-
<h4>page-agent</h4>
|
|
164
|
-
<p>Delegates to an <a href="https://github.com/alibaba/page-agent" target="_blank" rel="noreferrer">alibaba/page-agent</a> bridge (<code>window.__browserAgentPageAgent</code>). Best for complex multi-step goals.</p>
|
|
234
|
+
<p>On-device LLM via WebGPU through <code>window.__browserAgentWebLLM</code>. Fully private. Supports the reflection loop and custom system prompts.</p>
|
|
165
235
|
</article>
|
|
166
236
|
</div>
|
|
167
237
|
|
|
168
238
|
<h3>Supported Actions</h3>
|
|
169
239
|
<table>
|
|
170
240
|
<thead>
|
|
171
|
-
<tr><th>Action</th><th>Description</th></tr>
|
|
241
|
+
<tr><th>Action</th><th>Description</th><th>Risk level</th></tr>
|
|
172
242
|
</thead>
|
|
173
243
|
<tbody>
|
|
174
|
-
<tr><td><code>
|
|
175
|
-
<tr><td><code>
|
|
176
|
-
<tr><td><code>
|
|
177
|
-
<tr><td><code>
|
|
178
|
-
<tr><td><code>
|
|
179
|
-
<tr><td><code>
|
|
180
|
-
<tr><td><code>
|
|
181
|
-
<tr><td><code>done</code></td><td>Signal task completion</td></tr>
|
|
244
|
+
<tr><td><code>navigate</code></td><td>Navigate to a URL (http/https only)</td><td>safe</td></tr>
|
|
245
|
+
<tr><td><code>click</code></td><td>Click an element by CSS selector</td><td>safe / review</td></tr>
|
|
246
|
+
<tr><td><code>type</code></td><td>Type text into an input or textarea</td><td>safe / review</td></tr>
|
|
247
|
+
<tr><td><code>scroll</code></td><td>Scroll a container or the page</td><td>safe</td></tr>
|
|
248
|
+
<tr><td><code>focus</code></td><td>Focus an element (useful for dropdowns)</td><td>safe</td></tr>
|
|
249
|
+
<tr><td><code>wait</code></td><td>Pause for N milliseconds</td><td>safe</td></tr>
|
|
250
|
+
<tr><td><code>extract</code></td><td>Extract text from an element</td><td>review</td></tr>
|
|
251
|
+
<tr><td><code>done</code></td><td>Signal task completion</td><td>safe</td></tr>
|
|
182
252
|
</tbody>
|
|
183
253
|
</table>
|
|
184
254
|
|
|
185
|
-
<h3>Safety
|
|
255
|
+
<h3>Safety Model</h3>
|
|
186
256
|
<ul>
|
|
187
|
-
<li>
|
|
188
|
-
<li>
|
|
189
|
-
<li>
|
|
257
|
+
<li><strong>safe</strong> — executes immediately in all modes.</li>
|
|
258
|
+
<li><strong>review</strong> — pauses in <code>human-approved</code> mode; executes in <code>autonomous</code>. Triggered by actions on labels matching delete / submit / pay / confirm / transfer.</li>
|
|
259
|
+
<li><strong>blocked</strong> — never executes. Triggered by <code>javascript:</code>, <code>file:</code>, or malformed URLs.</li>
|
|
190
260
|
</ul>
|
|
191
261
|
</div>
|
|
192
262
|
</div>
|
|
@@ -197,92 +267,85 @@ controller.abort();</code></pre>
|
|
|
197
267
|
<div class="wrap">
|
|
198
268
|
<div class="surface">
|
|
199
269
|
<h2>Architecture</h2>
|
|
200
|
-
<p>
|
|
270
|
+
<p>OmniBrowser Agent is split into two delivery modes that share the same underlying engine. See the full breakdown in <a href="https://github.com/akshayram1/omnibrowser-agent/blob/main/docs/arch.md" target="_blank" rel="noreferrer">docs/arch.md</a>.</p>
|
|
201
271
|
|
|
202
|
-
<h3>
|
|
203
|
-
<
|
|
204
|
-
<
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
272
|
+
<h3>Delivery Layer</h3>
|
|
273
|
+
<div class="docs-grid">
|
|
274
|
+
<article class="doc-card">
|
|
275
|
+
<h4>🧩 Chrome Extension</h4>
|
|
276
|
+
<p>Popup UI + background service worker. Manages sessions per tab and drives the tick loop via <code>chrome.tabs.sendMessage</code>.</p>
|
|
277
|
+
</article>
|
|
278
|
+
<article class="doc-card">
|
|
279
|
+
<h4>📦 npm Library</h4>
|
|
280
|
+
<p><code>createBrowserAgent()</code> — runs the same tick loop in-process inside your web app. No extension required.</p>
|
|
281
|
+
</article>
|
|
282
|
+
</div>
|
|
209
283
|
|
|
210
|
-
<h3>
|
|
284
|
+
<h3>Core Modules <code>src/core/</code></h3>
|
|
211
285
|
<div class="docs-grid">
|
|
212
286
|
<article class="doc-card">
|
|
213
|
-
<h4>
|
|
214
|
-
<p>
|
|
287
|
+
<h4>observer.ts</h4>
|
|
288
|
+
<p>Queries all interactive elements, filters invisible ones, resolves accessible labels (<code>aria-label</code>, <code>for/id</code>, wrapping <code>&lt;label&gt;</code>), caps at 60 candidates. Returns <code>PageSnapshot</code>.</p>
|
|
215
289
|
</article>
|
|
216
290
|
<article class="doc-card">
|
|
217
|
-
<h4>
|
|
218
|
-
<p>
|
|
291
|
+
<h4>planner.ts</h4>
|
|
292
|
+
<p>Calls heuristic regex or the <code>window.__browserAgentWebLLM</code> bridge. Returns <code>PlannerResult</code> — action plus optional <code>evaluation</code>, <code>memory</code>, <code>nextGoal</code>.</p>
|
|
219
293
|
</article>
|
|
220
294
|
<article class="doc-card">
|
|
221
|
-
<h4>
|
|
222
|
-
<p
|
|
223
|
-
<strong>planner</strong> — next-action decision.<br>
|
|
224
|
-
<strong>safety</strong> — risk gating.<br>
|
|
225
|
-
<strong>executor</strong> — DOM action execution.</p>
|
|
295
|
+
<h4>executor.ts</h4>
|
|
296
|
+
<p>Performs DOM actions. Uses <code>InputEvent</code> with <code>bubbles: true</code> for React/Vue compat. Verifies that the element exists, that it is not disabled, and that its value was updated. Throws on failure so the retry loop feeds <code>lastError</code> back.</p>
|
|
226
297
|
</article>
|
|
227
298
|
</div>
|
|
228
299
|
|
|
229
|
-
<h3>
|
|
230
|
-
<
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
300
|
+
<h3>Data Flow — One Tick</h3>
|
|
301
|
+
<pre><code>goal + history + memory
|
|
302
|
+
│
|
|
303
|
+
▼
|
|
304
|
+
observer.collectSnapshot() → PageSnapshot (url, title, candidates[])
|
|
305
|
+
│
|
|
306
|
+
▼
|
|
307
|
+
planner.planNextAction() → PlannerResult
|
|
308
|
+
{ action, evaluation?, memory?, nextGoal? }
|
|
309
|
+
│
|
|
310
|
+
▼
|
|
311
|
+
safety.assessRisk(action) → safe | review | blocked
|
|
312
|
+
│
|
|
313
|
+
┌────┴──────────────────────────┐
|
|
314
|
+
blocked review (human-approved mode)
|
|
315
|
+
│ │
|
|
316
|
+
stop pause → user approves → resume()
|
|
317
|
+
│
|
|
318
|
+
safe / approved
|
|
319
|
+
│
|
|
320
|
+
▼
|
|
321
|
+
executor.executeAction(action) → result string
|
|
322
|
+
│
|
|
323
|
+
▼
|
|
324
|
+
session.history.push(result)
|
|
325
|
+
session.memory = plannerResult.memory
|
|
326
|
+
→ next tick</code></pre>
|
|
327
|
+
|
|
328
|
+
<h3>WebLLM Bridge Contract</h3>
|
|
329
|
+
<p>Attach an object to <code>window.__browserAgentWebLLM</code> before starting the agent. The bridge can return either the new <code>PlannerResult</code> format or a bare <code>AgentAction</code> (backward compatible).</p>
|
|
257
330
|
<pre><code>window.__browserAgentWebLLM = {
|
|
258
331
|
async plan(input, modelId) {
|
|
259
|
-
//
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
const pa = new PageAgent({
|
|
268
|
-
baseURL: "https://api.openai.com/v1",
|
|
269
|
-
model: "gpt-4o",
|
|
270
|
-
apiKey: "sk-..."
|
|
271
|
-
});
|
|
272
|
-
|
|
273
|
-
window.__browserAgentPageAgent = {
|
|
274
|
-
async plan(input) {
|
|
275
|
-
const result = await pa.execute(input.goal);
|
|
276
|
-
return { type: "done", reason: result.data };
|
|
332
|
+
// input.goal, input.snapshot, input.history,
|
|
333
|
+
// input.lastError, input.memory, input.systemPrompt
|
|
334
|
+
return {
|
|
335
|
+
evaluation: "Previous step succeeded.",
|
|
336
|
+
memory: "Name field is #name.",
|
|
337
|
+
next_goal: "Fill the email field.",
|
|
338
|
+
action: { "type": "type", "selector": "#email", "text": "jane@example.com", "clearFirst": true }
|
|
339
|
+
};
|
|
277
340
|
}
|
|
278
341
|
};</code></pre>
|
|
279
342
|
|
|
280
343
|
<h3>Current Limitations</h3>
|
|
281
344
|
<ul>
|
|
282
|
-
<li>No persistent long-term memory yet</li>
|
|
283
|
-
<li>No
|
|
284
|
-
<li>Risk scoring is
|
|
285
|
-
<li>No
|
|
345
|
+
<li>No persistent long-term memory (IndexedDB) yet</li>
|
|
346
|
+
<li>No goal decomposition / multi-step task graphs yet</li>
|
|
347
|
+
<li>Risk scoring is keyword-based, not semantic</li>
|
|
348
|
+
<li>No selector healing or fallback strategy yet</li>
|
|
286
349
|
</ul>
|
|
287
350
|
</div>
|
|
288
351
|
</div>
|
|
@@ -293,12 +356,12 @@ window.__browserAgentPageAgent = {
|
|
|
293
356
|
<div class="wrap">
|
|
294
357
|
<div class="surface">
|
|
295
358
|
<h2>Embedding Guide</h2>
|
|
296
|
-
<p>
|
|
359
|
+
<p>Embed OmniBrowser Agent as a library in any web application. Full reference in <a href="https://github.com/akshayram1/omnibrowser-agent/blob/main/docs/EMBEDDING.md" target="_blank" rel="noreferrer">docs/EMBEDDING.md</a>.</p>
|
|
297
360
|
|
|
298
361
|
<h3>Install</h3>
|
|
299
362
|
<pre><code>npm install @akshayram1/omnibrowser-agent</code></pre>
|
|
300
363
|
|
|
301
|
-
<h3>
|
|
364
|
+
<h3>Heuristic Planner (zero setup)</h3>
|
|
302
365
|
<pre><code>import { createBrowserAgent } from "@akshayram1/omnibrowser-agent";
|
|
303
366
|
|
|
304
367
|
const agent = createBrowserAgent(
|
|
@@ -310,65 +373,79 @@ const agent = createBrowserAgent(
|
|
|
310
373
|
stepDelayMs: 400
|
|
311
374
|
},
|
|
312
375
|
{
|
|
313
|
-
onStep:
|
|
314
|
-
onApprovalRequired: (action) =>
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
},
|
|
318
|
-
onDone: (result) => console.log("done", result),
|
|
319
|
-
onError: (error) => console.error(error)
|
|
376
|
+
onStep: (result) => console.log("step", result),
|
|
377
|
+
onApprovalRequired: (action) => showApprovalModal(action),
|
|
378
|
+
onDone: (result) => console.log("done", result),
|
|
379
|
+
onError: (error) => console.error(error)
|
|
320
380
|
}
|
|
321
381
|
);
|
|
322
382
|
|
|
323
|
-
await agent.start()
|
|
383
|
+
await agent.start();
|
|
324
384
|
|
|
325
|
-
|
|
326
|
-
|
|
385
|
+
// Approve a paused action:
|
|
386
|
+
await agent.approvePendingAction();
|
|
327
387
|
|
|
328
|
-
|
|
329
|
-
|
|
388
|
+
// Stop at any time:
|
|
389
|
+
agent.stop();</code></pre>
|
|
330
390
|
|
|
331
|
-
<h3>WebLLM
|
|
332
|
-
<p>
|
|
333
|
-
<pre><code>
|
|
391
|
+
<h3>WebLLM Planner with Reflection</h3>
|
|
392
|
+
<p>Load a WebLLM engine, wire the bridge, then start the agent. The bridge receives the full reflection input and should return the reflection+action object:</p>
|
|
393
|
+
<pre><code>import * as webllm from "@mlc-ai/web-llm";
|
|
394
|
+
import { createBrowserAgent, parsePlannerResult } from "@akshayram1/omnibrowser-agent";
|
|
395
|
+
|
|
396
|
+
const engine = await webllm.CreateMLCEngine("Llama-3.2-3B-Instruct-q4f16_1-MLC");
|
|
397
|
+
|
|
398
|
+
window.__browserAgentWebLLM = {
|
|
334
399
|
async plan(input, modelId) {
|
|
335
|
-
|
|
336
|
-
|
|
400
|
+
const { goal, history, lastError, memory, systemPrompt } = input;
|
|
401
|
+
|
|
402
|
+
const defaultSystem = `You are a browser automation agent.
|
|
403
|
+
Output ONLY a JSON object in this format:
|
|
404
|
+
{"evaluation":"...","memory":"...","next_goal":"...","action":{...}}`;
|
|
405
|
+
|
|
406
|
+
const resp = await engine.chat.completions.create({
|
|
407
|
+
messages: [
|
|
408
|
+
{ role: "system", content: systemPrompt || defaultSystem },
|
|
409
|
+
{ role: "user", content: `Goal: "${goal}"\nHistory: ${history.slice(-4).join(" → ")}${memory ? "\nMemory: " + memory : ""}${lastError ? "\nLast error: " + lastError : ""}` }
|
|
410
|
+
],
|
|
411
|
+
temperature: 0,
|
|
412
|
+
max_tokens: 200
|
|
413
|
+
});
|
|
414
|
+
|
|
415
|
+
return parsePlannerResult(resp.choices[0].message.content);
|
|
337
416
|
}
|
|
338
417
|
};
|
|
339
418
|
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
<pre><code>npm install page-agent</code></pre>
|
|
349
|
-
<pre><code>import { PageAgent } from "page-agent";
|
|
350
|
-
|
|
351
|
-
const pa = new PageAgent({
|
|
352
|
-
baseURL: "https://api.openai.com/v1",
|
|
353
|
-
model: "gpt-4o",
|
|
354
|
-
apiKey: "sk-..."
|
|
419
|
+
const agent = createBrowserAgent({
|
|
420
|
+
goal: "Fill the checkout form with my details",
|
|
421
|
+
planner: { kind: "webllm" }
|
|
422
|
+
}, {
|
|
423
|
+
onStep(result) {
|
|
424
|
+
if (result.reflection?.nextGoal) console.log("💭", result.reflection.nextGoal);
|
|
425
|
+
console.log("✅", result.message);
|
|
426
|
+
}
|
|
355
427
|
});
|
|
356
428
|
|
|
357
|
-
|
|
358
|
-
async plan(input) {
|
|
359
|
-
const result = await pa.execute(input.goal);
|
|
360
|
-
return { type: "done", reason: result.data };
|
|
361
|
-
}
|
|
362
|
-
};
|
|
429
|
+
await agent.start();</code></pre>
|
|
363
430
|
|
|
364
|
-
|
|
365
|
-
|
|
431
|
+
<h3>Custom System Prompt</h3>
|
|
432
|
+
<p>Shape the agent's personality or constraints without touching the bridge:</p>
|
|
433
|
+
<pre><code>const agent = createBrowserAgent({
|
|
434
|
+
goal: "Book a meeting room for tomorrow",
|
|
435
|
+
planner: {
|
|
436
|
+
kind: "webllm",
|
|
437
|
+
systemPrompt: `You are a careful meeting room booking assistant.
|
|
438
|
+
Always confirm the room is available before clicking Book.
|
|
439
|
+
Never navigate away from the booking portal.`
|
|
440
|
+
}
|
|
441
|
+
});</code></pre>
|
|
366
442
|
|
|
367
443
|
<h3>Notes</h3>
|
|
368
444
|
<ul>
|
|
369
|
-
<li>
|
|
370
|
-
<li
|
|
371
|
-
<li
|
|
445
|
+
<li>The WebLLM bridge is not bundled — bring your own engine and attach it to <code>window.__browserAgentWebLLM</code>.</li>
|
|
446
|
+
<li>Use <code>human-approved</code> mode for CRM, finance, and admin actions.</li>
|
|
447
|
+
<li>Bridges returning a bare <code>AgentAction</code> still work — backward compatible.</li>
|
|
448
|
+
<li>For production apps, mount inside an authenticated shell and add your own permission checks.</li>
|
|
372
449
|
</ul>
|
|
373
450
|
</div>
|
|
374
451
|
</div>
|
|
@@ -379,6 +456,7 @@ planner: { kind: "page-agent" }</code></pre>
|
|
|
379
456
|
<div class="wrap">
|
|
380
457
|
<div class="surface">
|
|
381
458
|
<h2>Roadmap</h2>
|
|
459
|
+
<p>Full roadmap in <a href="https://github.com/akshayram1/omnibrowser-agent/blob/main/docs/ROADMAP.md" target="_blank" rel="noreferrer">docs/ROADMAP.md</a>.</p>
|
|
382
460
|
|
|
383
461
|
<h3>v0.1</h3>
|
|
384
462
|
<ul>
|
|
@@ -388,21 +466,30 @@ planner: { kind: "page-agent" }</code></pre>
|
|
|
388
466
|
<li>Human-approved mode</li>
|
|
389
467
|
</ul>
|
|
390
468
|
|
|
391
|
-
<h3>v0.2 <span class="badge">
|
|
469
|
+
<h3>v0.2 <span class="badge">stable</span></h3>
|
|
392
470
|
<ul>
|
|
393
471
|
<li>New actions: <code>scroll</code>, <code>focus</code></li>
|
|
394
472
|
<li>Improved heuristic planner with regex goal patterns</li>
|
|
395
|
-
<li>Better page observation (visibility filtering,
|
|
473
|
+
<li>Better page observation (visibility filtering, up to 60 candidates)</li>
|
|
396
474
|
<li>Library API: <code>resume()</code>, <code>isRunning</code>, <code>hasPendingAction</code>, <code>AbortSignal</code>, <code>onMaxStepsReached</code></li>
|
|
397
|
-
<li
|
|
475
|
+
<li>CI pipeline with auto version bump on push to main</li>
|
|
476
|
+
</ul>
|
|
477
|
+
|
|
478
|
+
<h3>v0.2.6 <span class="badge new">current</span></h3>
|
|
479
|
+
<ul>
|
|
480
|
+
<li>Reflection-before-action pattern (<code>evaluation → memory → next_goal → act</code>)</li>
|
|
481
|
+
<li>Working memory carried across ticks via <code>AgentSession.memory</code></li>
|
|
482
|
+
<li><code>parsePlannerResult()</code> exported from library</li>
|
|
483
|
+
<li><code>systemPrompt</code> option in <code>PlannerConfig</code></li>
|
|
484
|
+
<li>Thought bubble (💭) messages in live demo</li>
|
|
485
|
+
<li>Chatbot UI redesign: tabs, typing indicator, right-aligned messages</li>
|
|
398
486
|
</ul>
|
|
399
487
|
|
|
400
488
|
<h3>v0.3</h3>
|
|
401
489
|
<ul>
|
|
402
490
|
<li>Site profile and policy engine (allowlist, blocked domains)</li>
|
|
403
491
|
<li>Selector healing and fallback strategy</li>
|
|
404
|
-
<li>Session
|
|
405
|
-
<li>Drupal CRM starter skills</li>
|
|
492
|
+
<li>Session replay log</li>
|
|
406
493
|
</ul>
|
|
407
494
|
|
|
408
495
|
<h3>v1.0</h3>
|
|
@@ -425,27 +512,11 @@ planner: { kind: "page-agent" }</code></pre>
|
|
|
425
512
|
<h2>Contact</h2>
|
|
426
513
|
<p>Maintainer: Akshay Chame</p>
|
|
427
514
|
<ul>
|
|
428
|
-
<li>
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
</li>
|
|
432
|
-
<li>
|
|
433
|
-
GitHub:
|
|
434
|
-
<a href="https://github.com/akshayram1" target="_blank" rel="noreferrer">@akshayram1</a>
|
|
435
|
-
</li>
|
|
436
|
-
<li>
|
|
437
|
-
Package:
|
|
438
|
-
<a
|
|
439
|
-
href="https://www.npmjs.com/package/@akshayram1/omnibrowser-agent"
|
|
440
|
-
target="_blank"
|
|
441
|
-
rel="noreferrer"
|
|
442
|
-
>@akshayram1/omnibrowser-agent</a
|
|
443
|
-
>
|
|
444
|
-
</li>
|
|
515
|
+
<li>Email: <a href="mailto:akshaychame2@gmail.com">akshaychame2@gmail.com</a></li>
|
|
516
|
+
<li>GitHub: <a href="https://github.com/akshayram1" target="_blank" rel="noreferrer">@akshayram1</a></li>
|
|
517
|
+
<li>Package: <a href="https://www.npmjs.com/package/@akshayram1/omnibrowser-agent" target="_blank" rel="noreferrer">@akshayram1/omnibrowser-agent</a></li>
|
|
445
518
|
</ul>
|
|
446
|
-
<p class="contact-note">
|
|
447
|
-
For feature requests or bugs, please open an issue on GitHub with reproduction steps.
|
|
448
|
-
</p>
|
|
519
|
+
<p class="contact-note">For feature requests or bugs, please open an issue on GitHub with reproduction steps.</p>
|
|
449
520
|
</div>
|
|
450
521
|
</div>
|
|
451
522
|
</section>
|
|
@@ -453,7 +524,7 @@ planner: { kind: "page-agent" }</code></pre>
|
|
|
453
524
|
|
|
454
525
|
<footer class="footer">
|
|
455
526
|
<div class="wrap">
|
|
456
|
-
<p>© 2026 OmniBrowser Agent · MIT License</p>
|
|
527
|
+
<p>© 2026 OmniBrowser Agent · MIT License · <a href="https://github.com/akshayram1/omnibrowser-agent" target="_blank" rel="noreferrer">GitHub</a></p>
|
|
457
528
|
</div>
|
|
458
529
|
</footer>
|
|
459
530
|
</body>
|