@akshayram1/omnibrowser-agent 0.2.6 → 0.2.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,183 +1,292 @@
1
1
  # omnibrowser-agent
2
2
 
3
+ [![npm](https://img.shields.io/npm/v/@akshayram1/omnibrowser-agent)](https://www.npmjs.com/package/@akshayram1/omnibrowser-agent)
3
4
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
4
- [![Version](https://img.shields.io/badge/version-0.2.2-green.svg)](package.json)
5
5
 
6
- Local-first open-source browser AI operator using in-browser planning and page actions.
7
-
8
- ## Why this project
9
-
10
- - Privacy-first: run agent logic in browser
11
- - No per-request cloud token costs
12
- - Dual delivery:
13
- - Browser extension mode
14
- - Embeddable library mode for web apps
15
- - Hybrid control modes:
16
- - Autonomous
17
- - Human-approved
18
-
19
- ## Stack
20
-
21
- - MV3 browser extension runtime
22
- - TypeScript + esbuild
23
- - Pluggable planner bridges: WebLLM (local, in-browser)
6
+ Local-first browser AI operator. Plans and executes DOM actions entirely in the browser — no API keys, no cloud costs, no data leaving your machine.
7
+
8
+ [Live Demo](https://omnibrowser-agent.vercel.app/examples/chatbot/) · [Embedding Guide](docs/EMBEDDING.md) · [Architecture](docs/arch.md) · [Deployment](docs/DEPLOYMENT.md) · [Roadmap](docs/ROADMAP.md)
9
+
10
+ ---
11
+
12
+ ## Architecture
13
+
14
+ ```mermaid
15
+ flowchart TB
16
+ subgraph DELIVERY["Delivery Layer"]
17
+ EXT["🧩 Chrome Extension\npopup + background worker"]
18
+ LIB["📦 npm Library\ncreateBrowserAgent()"]
19
+ end
20
+
21
+ subgraph ORCHESTRATION["Orchestration"]
22
+ BG["background/index.ts\nSession & tick loop"]
23
+ BA["BrowserAgent class\nrunLoop() / resume() / stop()"]
24
+ end
25
+
26
+ subgraph CORE["Core (src/core/)"]
27
+ PL["planner.ts\nheuristicPlan() / webllm bridge\nplanNextAction()"]
28
+ OB["observer.ts\ncollectSnapshot()\nDOM candidates + visibility filter"]
29
+ EX["executor.ts\nexecuteAction()\nclick / type / navigate\nscroll / focus / wait"]
30
+ end
31
+
32
+ subgraph SHARED["Shared (src/shared/)"]
33
+ CT["contracts.ts\nAgentAction · PageSnapshot\nAgentSession · PlannerResult"]
34
+ SF["safety.ts\nassessRisk()\nsafe / review / blocked"]
35
+ PA["parse-action.ts\nparseAction()\nparsePlannerResult()"]
36
+ end
37
+
38
+ subgraph OUTCOMES["Action Outcomes"]
39
+ direction LR
40
+ OK["✅ safe → execute"]
41
+ RV["⚠️ review → needs approval"]
42
+ BL["🚫 blocked → stop"]
43
+ end
44
+
45
+ subgraph PLANNERS["Planner Modes"]
46
+ direction LR
47
+ HP["Heuristic\nzero deps · offline\nregex patterns"]
48
+ WL["WebLLM\non-device · WebGPU\nwindow.__browserAgentWebLLM"]
49
+ end
50
+
51
+ EXT --> BG
52
+ LIB --> BA
53
+ BG -. "chrome.tabs.sendMessage" .-> CORE
54
+ BA --> CORE
55
+
56
+ PL --> OB
57
+ PL --> SHARED
58
+ OB --> SHARED
59
+ EX --> SHARED
60
+
61
+ SF --> OUTCOMES
62
+ PL --> PLANNERS
63
+ ```
24
64
 
25
- ## Project structure
65
+ ---
26
66
 
27
- - `src/background` session orchestration
28
- - `src/content` page observer/planner/executor
29
- - `src/popup` control panel
30
- - `src/lib` embeddable runtime API
31
- - `src/shared` contracts and safety
67
+ ## How it works — one tick
32
68
 
33
- ## Quick start
69
+ ```
70
+ goal + history + memory
71
+
72
+
73
+ observer.collectSnapshot() ──→ PageSnapshot (url, title, candidates[])
74
+
75
+
76
+ planner.planNextAction() ──→ PlannerResult { action, evaluation?, memory?, nextGoal? }
77
+
78
+
79
+ safety.assessRisk(action) ──→ safe | review | blocked
80
+
81
+ ┌────┴─────────────────────┐
82
+ blocked review (human-approved mode)
83
+ │ │
84
+ stop pause → user approves → resume
85
+
86
+ safe / approved
87
+
88
+
89
+ executor.executeAction(action) ──→ result string
90
+
91
+
92
+ session.history.push(result)
93
+ → next tick
94
+ ```
34
95
 
35
- 1. Install dependencies:
96
+ The planner uses a **reflection loop** before each action: it evaluates what happened in the previous step, maintains working memory across steps, and states its next goal — giving the agent much better multi-step reasoning.
36
97
 
37
- ```bash
38
- npm install
39
- ```
98
+ ---
40
99
 
41
- 2. Build extension:
100
+ ## Install
42
101
 
43
102
  ```bash
44
- npm run build
103
+ npm install @akshayram1/omnibrowser-agent
45
104
  ```
46
105
 
47
- 3. Load extension in Chromium:
48
-
49
- - Open `chrome://extensions`
50
- - Enable Developer Mode
51
- - Click **Load unpacked**
52
- - Select `dist`
106
+ ---
53
107
 
54
- ## How to use
55
-
56
- 1. Open a target website tab
57
- 2. Open extension popup
58
- 3. Enter goal (for example: `search contact John Doe in CRM and open profile`)
59
- 4. Select mode/planner
60
- 5. Click Start
61
- 6. If mode is `human-approved`, click **Approve pending action** on review steps
62
-
63
- ## Use as a web library
108
+ ## Quick start
64
109
 
65
110
  ```ts
66
111
  import { createBrowserAgent } from "@akshayram1/omnibrowser-agent";
67
112
 
68
113
  const agent = createBrowserAgent({
69
- goal: "Open CRM and find customer John Smith",
70
- mode: "human-approved",
71
- planner: { kind: "heuristic" }
114
+ goal: "Search for contact Jane Doe and open her profile",
115
+ mode: "human-approved", // or "autonomous"
116
+ planner: { kind: "heuristic" } // or "webllm"
72
117
  }, {
73
- onStep: (result) => console.log(result.message),
74
- onApprovalRequired: (action) => console.log("Needs approval:", action),
75
- onDone: (result) => console.log("Done:", result.message),
76
- onMaxStepsReached: (session) => console.log("Max steps hit", session.history)
118
+ onStep: (result, session) => console.log(result.message),
119
+ onApprovalRequired: (action, session) => console.log("Review:", action),
120
+ onDone: (result, session) => console.log("Done:", result.message),
121
+ onError: (err, session) => console.error(err),
122
+ onMaxStepsReached: (session) => console.log("Max steps hit"),
77
123
  });
78
124
 
79
125
  await agent.start();
80
126
 
81
- // Resume after approval:
127
+ // After onApprovalRequired fires:
82
128
  await agent.resume();
83
129
 
84
- // Inspect state at any time:
85
- console.log(agent.isRunning, agent.hasPendingAction);
86
-
87
- // Stop at any time:
130
+ // Cancel at any time:
88
131
  agent.stop();
89
132
  ```
90
133
 
91
- ### Supported actions
134
+ ---
135
+
136
+ ## Planner modes
137
+
138
+ | Mode | Description | When to use |
139
+ |---|---|---|
140
+ | `heuristic` | Zero-dependency regex planner. Works fully offline. | Simple, predictable goals — navigate, fill, click |
141
+ | `webllm` | On-device LLM via WebGPU. Fully private, no API calls. | Open-ended, multi-step, language-heavy goals |
142
+
143
+ ### WebLLM with a custom system prompt
144
+
145
+ ```ts
146
+ const agent = createBrowserAgent({
147
+ goal: "Fill the checkout form",
148
+ planner: {
149
+ kind: "webllm",
150
+ systemPrompt: "You are a careful checkout assistant. Never submit before all required fields are filled."
151
+ }
152
+ });
153
+ ```
154
+
155
+ See [docs/EMBEDDING.md](docs/EMBEDDING.md) for the full WebLLM bridge wiring guide.
156
+
157
+ ### Recommended WebLLM models
158
+
159
+ - `Llama-3.2-1B-Instruct-q4f16_1-MLC` — fast, ~600 MB
160
+ - `Llama-3.2-3B-Instruct-q4f16_1-MLC` — better quality, ~1.5 GB
161
+ - `Phi-3.5-mini-instruct-q4f16_1-MLC` — strong quality, ~2 GB
162
+ - `Mistral-7B-Instruct-v0.3-q4f16_1-MLC` — balanced quality, ~4.1 GB
163
+ - `Qwen2.5-7B-Instruct-q4f16_1-MLC` — strongest quality, ~4.3 GB
164
+ - `Llama-3.1-8B-Instruct-q4f16_1-MLC` — strong reasoning, ~4.8 GB
165
+
166
+ Model availability can vary by WebLLM release/build; if one fails to load, use a smaller fallback like `Llama-3.2-1B-Instruct-q4f16_1-MLC`.
92
167
 
93
- | Action | Description |
94
- |------------|------------------------------------------|
95
- | `click` | Click an element by CSS selector |
96
- | `type` | Type text into an input or textarea |
97
- | `navigate` | Navigate to a URL |
98
- | `extract` | Extract text from an element |
99
- | `scroll` | Scroll a container or the page |
100
- | `focus` | Focus an element (useful for dropdowns) |
101
- | `wait` | Pause for a given number of milliseconds |
102
- | `done` | Signal task completion |
168
+ ---
169
+
170
+ ## Agent modes
171
+
172
+ | Mode | Behaviour |
173
+ |---|---|
174
+ | `autonomous` | All `safe` and `review` actions execute without pause |
175
+ | `human-approved` | `review`-rated actions pause and emit `onApprovalRequired` — call `resume()` to continue |
103
176
 
104
- ### AbortSignal support
177
+ ---
178
+
179
+ ## Supported actions
180
+
181
+ | Action | Description | Risk |
182
+ |---|---|---|
183
+ | `navigate` | Navigate to a URL (http/https only) | safe |
184
+ | `click` | Click an element by CSS selector | safe / review |
185
+ | `type` | Type text into an input or textarea | safe / review |
186
+ | `scroll` | Scroll a container or the page | safe |
187
+ | `focus` | Focus an element | safe |
188
+ | `wait` | Pause for N milliseconds | safe |
189
+ | `extract` | Extract text from an element | review |
190
+ | `done` | Signal task completion | safe |
191
+
192
+ ---
193
+
194
+ ## AbortSignal support
105
195
 
106
196
  ```ts
107
197
  const controller = new AbortController();
108
198
  const agent = createBrowserAgent({ goal: "...", signal: controller.signal });
109
199
  agent.start();
110
200
 
111
- // Cancel from outside:
112
- controller.abort();
201
+ controller.abort(); // cancel from outside
113
202
  ```
114
203
 
115
- See full integration guide in `docs/EMBEDDING.md`.
204
+ ---
116
205
 
117
- ## Example site (embedded usage)
206
+ ## Chrome Extension
118
207
 
119
- 1. Build library assets:
208
+ 1. Build:
120
209
 
121
210
  ```bash
122
211
  npm run build
123
212
  ```
124
213
 
125
- 2. Serve the repository root (required for browser ESM import paths):
214
+ 2. Open `chrome://extensions`, enable **Developer Mode**, click **Load unpacked**, select `dist/`.
126
215
 
127
- ```bash
128
- python3 -m http.server 4173
129
- ```
216
+ 3. Open any tab, enter a goal in the popup, pick a mode, and click **Start**.
130
217
 
131
- 3. Open:
218
+ See [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for publishing and CI pipeline details.
132
219
 
133
- - `http://localhost:4173/examples/simple-site/`
220
+ ---
134
221
 
135
- The example uses `createBrowserAgent` from `dist/lib.js` and includes UI buttons for start/approve/stop.
136
- It is preconfigured to use `webllm` planner mode and loads `@mlc-ai/web-llm` from CDN in the example page.
222
+ ## Project structure
223
+
224
+ ```
225
+ src/
226
+ ├── background/ Extension service worker — session management
227
+ ├── content/ Extension content script — runs in page context
228
+ ├── core/ Shared engine (planner, observer, executor)
229
+ │ ├── planner.ts
230
+ │ ├── observer.ts
231
+ │ └── executor.ts
232
+ ├── lib/ npm library entry — BrowserAgent class
233
+ │ └── index.ts
234
+ ├── popup/ Extension popup UI
235
+ └── shared/ Types, safety, and parse utilities
236
+ ├── contracts.ts
237
+ ├── safety.ts
238
+ └── parse-action.ts
239
+ ```
240
+
241
+ ---
137
242
 
138
243
  ## Changelog
139
244
 
245
+ ### v0.2.6
246
+
247
+ - Reflection-before-action pattern (`evaluation → memory → next_goal → action`) — agent reasons about each step before acting
248
+ - Working memory carried across ticks for better multi-step goals
249
+ - `parsePlannerResult()` exported from the library
250
+ - `systemPrompt` option in `PlannerConfig` — pass your own prompt without rewriting the bridge
251
+ - Thought bubble (💭) messages in the live demo chat showing the agent's next intent
252
+
253
+ ### v0.2.4 — v0.2.5
254
+
255
+ - CI pipeline: auto version bump on push to main
256
+ - Removed page-agent dependency — reflection pattern implemented natively
257
+ - Chatbot demo redesign: right-aligned user messages, typing indicator, tab navigation (CRM + Task Manager)
258
+ - `parsePlannerResult()` and `PlannerResult` type exported from library
259
+
140
260
  ### v0.2.2
141
261
 
142
- - SDK/extension separation: core logic moved to `src/core/` shared between extension and npm library
262
+ - SDK/extension separation: core logic in `src/core/` shared between extension and npm library
143
263
  - 22 unit tests across planner and safety modules
144
264
  - Action verification in executor (disabled-check, value-verify, empty-check)
145
265
  - `CandidateElement.label` from associated `<label>` elements
146
266
  - Retry loop with `lastError` fed back to planner on failure
147
- - `parseAction` utility exported from the library
148
267
 
149
268
  ### v0.2.0
150
269
 
151
- - **New actions**: `scroll` and `focus`
152
- - **Smarter safety**: risk assessment now checks element label/text rather than CSS selector strings
153
- - **Improved heuristic planner**: handles navigate, fill, click, and search goal patterns with regex matching
154
- - **Better page observation**: filters hidden/invisible elements, includes `placeholder` in candidate data, captures up to 60 candidates
155
- - **Library API**: added `resume()`, `isRunning` and `hasPendingAction` getters, `onMaxStepsReached` event, and `AbortSignal` support
156
- - **Executor**: uses `InputEvent` for proper framework compatibility, added keyboard event dispatch
157
- - **License**: added author name
270
+ - New actions: `scroll` and `focus`
271
+ - Smarter safety: risk assessment checks element label/text
272
+ - Improved heuristic planner with regex pattern matching
273
+ - Better page observation: filters invisible elements, up to 60 candidates
274
+ - Library API: `resume()`, `isRunning`, `hasPendingAction`, `onMaxStepsReached`, `AbortSignal`
158
275
 
159
276
  ### v0.1.0
160
277
 
161
- - Extension runtime loop
162
- - Shared action contracts
163
- - Heuristic + WebLLM planner switch
164
- - Human-approved mode
165
-
166
- ## Planner modes
167
-
168
- | Mode | Description |
169
- |---|---|
170
- | `heuristic` | Zero-dependency regex-based planner. Works offline. Good for simple, predictable goals. |
171
- | `webllm` | Delegates to a local WebLLM bridge on `window.__browserAgentWebLLM`. Fully private, no API calls, runs on-device via WebGPU. |
278
+ - Extension runtime loop, shared action contracts, heuristic + WebLLM planner, human-approved mode
172
279
 
173
- ## Notes
280
+ ---
174
281
 
175
- - Local inference has no API usage charges, but uses device CPU/GPU/memory.
176
- - `webllm` mode expects a bridge implementation attached to `window.__browserAgentWebLLM`. See `docs/EMBEDDING.md` for a complete example.
282
+ ## Docs
177
283
 
178
- ## Roadmap
284
+ - [Embedding Guide](docs/EMBEDDING.md) — integrate into any web app
285
+ - [Architecture](docs/arch.md) — layer-by-layer breakdown
286
+ - [Deployment](docs/DEPLOYMENT.md) — npm publish, Vercel, Chrome extension, CI
287
+ - [Roadmap](docs/ROADMAP.md) — planned features
179
288
 
180
- See [docs/ROADMAP.md](docs/ROADMAP.md).
289
+ ---
181
290
 
182
291
  ## License
183
292
 
@@ -1,14 +1,29 @@
1
1
  // src/background/index.ts
2
2
  var sessions = /* @__PURE__ */ new Map();
3
- function makeSession(tabId, goal, mode, plannerKind) {
3
+ function normalizePlannerConfig(rawPlanner) {
4
+ if (typeof rawPlanner === "string" && (rawPlanner === "heuristic" || rawPlanner === "webllm")) {
5
+ return { kind: rawPlanner };
6
+ }
7
+ if (typeof rawPlanner === "object" && rawPlanner !== null) {
8
+ const record = rawPlanner;
9
+ const kind = record.kind;
10
+ if (kind === "heuristic" || kind === "webllm") {
11
+ return {
12
+ kind,
13
+ modelId: typeof record.modelId === "string" && record.modelId.trim() ? record.modelId : void 0,
14
+ systemPrompt: typeof record.systemPrompt === "string" && record.systemPrompt.trim() ? record.systemPrompt : void 0
15
+ };
16
+ }
17
+ }
18
+ return { kind: "heuristic" };
19
+ }
20
+ function makeSession(tabId, goal, mode, planner) {
4
21
  return {
5
22
  id: crypto.randomUUID(),
6
23
  tabId,
7
24
  goal,
8
25
  mode,
9
- planner: {
10
- kind: plannerKind
11
- },
26
+ planner,
12
27
  history: [],
13
28
  isRunning: true
14
29
  };
@@ -23,6 +38,10 @@ async function tick(tabId) {
23
38
  session
24
39
  });
25
40
  session.history.push(result.message);
41
+ if (result.reflection?.memory !== void 0) {
42
+ session.memory = result.reflection.memory;
43
+ }
44
+ session.lastError = result.status === "error" ? result.message : void 0;
26
45
  if (result.status === "needs_approval") {
27
46
  session.pendingAction = result.action;
28
47
  session.isRunning = false;
@@ -37,7 +56,7 @@ async function tick(tabId) {
37
56
  }
38
57
  chrome.runtime.onMessage.addListener((message, _sender, sendResponse) => {
39
58
  if (message.type === "START_AGENT") {
40
- const session = makeSession(message.tabId, message.goal, message.mode, message.planner);
59
+ const session = makeSession(message.tabId, message.goal, message.mode, normalizePlannerConfig(message.planner));
41
60
  sessions.set(message.tabId, session);
42
61
  tick(message.tabId).catch((error) => {
43
62
  const failed = sessions.get(message.tabId);
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "version": 3,
3
3
  "sources": ["../src/background/index.ts"],
4
- "sourcesContent": ["import type { AgentMode, AgentSession, PlannerKind } from \"../shared/contracts\";\n\nconst sessions = new Map<number, AgentSession>();\n\nfunction makeSession(tabId: number, goal: string, mode: AgentMode, plannerKind: PlannerKind): AgentSession {\n return {\n id: crypto.randomUUID(),\n tabId: tabId,\n goal,\n mode,\n planner: {\n kind: plannerKind\n },\n history: [],\n isRunning: true\n };\n}\n\nasync function tick(tabId: number) {\n const session = sessions.get(tabId);\n if (!session || !session.isRunning) {\n return;\n }\n\n const result = await chrome.tabs.sendMessage(tabId, {\n type: \"AGENT_TICK\",\n session\n });\n\n session.history.push(result.message);\n\n if (result.status === \"needs_approval\") {\n session.pendingAction = result.action;\n session.isRunning = false;\n return;\n }\n\n session.pendingAction = undefined;\n\n if ([\"done\", \"blocked\", \"error\"].includes(result.status)) {\n session.isRunning = false;\n return;\n }\n\n setTimeout(() => tick(tabId), 600);\n}\n\nchrome.runtime.onMessage.addListener((message, _sender, sendResponse) => {\n if (message.type === \"START_AGENT\") {\n const session = makeSession(message.tabId, message.goal, message.mode, message.planner);\n sessions.set(message.tabId, session);\n tick(message.tabId).catch((error) => {\n const failed = sessions.get(message.tabId);\n if (failed) {\n failed.history.push(`Error: ${String(error)}`);\n failed.isRunning = false;\n }\n });\n sendResponse({ ok: true });\n return true;\n }\n\n if (message.type === \"APPROVE_ACTION\") {\n const session = sessions.get(message.tabId);\n if (!session) {\n sendResponse({ ok: false, error: \"No active session\" });\n return true;\n }\n\n session.isRunning = true;\n tick(message.tabId).catch((error) => {\n session.history.push(`Error: ${String(error)}`);\n session.isRunning = false;\n });\n sendResponse({ ok: true });\n return true;\n }\n\n if (message.type === \"STOP_AGENT\") {\n const session = sessions.get(message.tabId);\n 
if (session) {\n session.isRunning = false;\n }\n chrome.tabs.sendMessage(message.tabId, { type: \"AGENT_STOP\" }).catch(() => undefined);\n sendResponse({ ok: true });\n return true;\n }\n\n if (message.type === \"GET_STATUS\") {\n const lines = Array.from(sessions.values()).map(\n (session) =>\n `${session.isRunning ? \"RUNNING\" : \"IDLE\"} ${session.tabId}: ${session.goal.slice(0, 45)}${session.goal.length > 45 ? \"...\" : \"\"}`\n );\n\n sendResponse({ status: lines.length > 0 ? lines.join(\"\\n\") : \"Idle\" });\n return true;\n }\n\n return false;\n});\n"],
5
- "mappings": ";AAEA,IAAM,WAAW,oBAAI,IAA0B;AAE/C,SAAS,YAAY,OAAe,MAAc,MAAiB,aAAwC;AACzG,SAAO;AAAA,IACL,IAAI,OAAO,WAAW;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA,SAAS;AAAA,MACP,MAAM;AAAA,IACR;AAAA,IACA,SAAS,CAAC;AAAA,IACV,WAAW;AAAA,EACb;AACF;AAEA,eAAe,KAAK,OAAe;AACjC,QAAM,UAAU,SAAS,IAAI,KAAK;AAClC,MAAI,CAAC,WAAW,CAAC,QAAQ,WAAW;AAClC;AAAA,EACF;AAEA,QAAM,SAAS,MAAM,OAAO,KAAK,YAAY,OAAO;AAAA,IAClD,MAAM;AAAA,IACN;AAAA,EACF,CAAC;AAED,UAAQ,QAAQ,KAAK,OAAO,OAAO;AAEnC,MAAI,OAAO,WAAW,kBAAkB;AACtC,YAAQ,gBAAgB,OAAO;AAC/B,YAAQ,YAAY;AACpB;AAAA,EACF;AAEA,UAAQ,gBAAgB;AAExB,MAAI,CAAC,QAAQ,WAAW,OAAO,EAAE,SAAS,OAAO,MAAM,GAAG;AACxD,YAAQ,YAAY;AACpB;AAAA,EACF;AAEA,aAAW,MAAM,KAAK,KAAK,GAAG,GAAG;AACnC;AAEA,OAAO,QAAQ,UAAU,YAAY,CAAC,SAAS,SAAS,iBAAiB;AACvE,MAAI,QAAQ,SAAS,eAAe;AAClC,UAAM,UAAU,YAAY,QAAQ,OAAO,QAAQ,MAAM,QAAQ,MAAM,QAAQ,OAAO;AACtF,aAAS,IAAI,QAAQ,OAAO,OAAO;AACnC,SAAK,QAAQ,KAAK,EAAE,MAAM,CAAC,UAAU;AACnC,YAAM,SAAS,SAAS,IAAI,QAAQ,KAAK;AACzC,UAAI,QAAQ;AACV,eAAO,QAAQ,KAAK,UAAU,OAAO,KAAK,CAAC,EAAE;AAC7C,eAAO,YAAY;AAAA,MACrB;AAAA,IACF,CAAC;AACD,iBAAa,EAAE,IAAI,KAAK,CAAC;AACzB,WAAO;AAAA,EACT;AAEA,MAAI,QAAQ,SAAS,kBAAkB;AACrC,UAAM,UAAU,SAAS,IAAI,QAAQ,KAAK;AAC1C,QAAI,CAAC,SAAS;AACZ,mBAAa,EAAE,IAAI,OAAO,OAAO,oBAAoB,CAAC;AACtD,aAAO;AAAA,IACT;AAEA,YAAQ,YAAY;AACpB,SAAK,QAAQ,KAAK,EAAE,MAAM,CAAC,UAAU;AACnC,cAAQ,QAAQ,KAAK,UAAU,OAAO,KAAK,CAAC,EAAE;AAC9C,cAAQ,YAAY;AAAA,IACtB,CAAC;AACD,iBAAa,EAAE,IAAI,KAAK,CAAC;AACzB,WAAO;AAAA,EACT;AAEA,MAAI,QAAQ,SAAS,cAAc;AACjC,UAAM,UAAU,SAAS,IAAI,QAAQ,KAAK;AAC1C,QAAI,SAAS;AACX,cAAQ,YAAY;AAAA,IACtB;AACA,WAAO,KAAK,YAAY,QAAQ,OAAO,EAAE,MAAM,aAAa,CAAC,EAAE,MAAM,MAAM,MAAS;AACpF,iBAAa,EAAE,IAAI,KAAK,CAAC;AACzB,WAAO;AAAA,EACT;AAEA,MAAI,QAAQ,SAAS,cAAc;AACjC,UAAM,QAAQ,MAAM,KAAK,SAAS,OAAO,CAAC,EAAE;AAAA,MAC1C,CAAC,YACC,GAAG,QAAQ,YAAY,YAAY,MAAM,IAAI,QAAQ,KAAK,KAAK,QAAQ,KAAK,MAAM,GAAG,EAAE,CAAC,GAAG,QAAQ,KAAK,SAAS,KAAK,QAAQ,EAAE;AAAA,IACpI;AAEA,iBAAa,EAAE,QAAQ,MAAM,SAAS,IAAI,MAAM,KAAK,IAAI,IAAI,OAAO,CAAC;AACrE,WAAO;AAAA,EACT;AAEA,SAAO;AACT,CAAC;",
4
+ "sourcesContent": ["import type { AgentMode, AgentSession, PlannerConfig } from \"../shared/contracts\";\n\nconst sessions = new Map<number, AgentSession>();\n\nfunction normalizePlannerConfig(rawPlanner: unknown): PlannerConfig {\n if (typeof rawPlanner === \"string\" && (rawPlanner === \"heuristic\" || rawPlanner === \"webllm\")) {\n return { kind: rawPlanner };\n }\n\n if (typeof rawPlanner === \"object\" && rawPlanner !== null) {\n const record = rawPlanner as Record<string, unknown>;\n const kind = record.kind;\n if (kind === \"heuristic\" || kind === \"webllm\") {\n return {\n kind,\n modelId: typeof record.modelId === \"string\" && record.modelId.trim() ? record.modelId : undefined,\n systemPrompt: typeof record.systemPrompt === \"string\" && record.systemPrompt.trim() ? record.systemPrompt : undefined\n };\n }\n }\n\n return { kind: \"heuristic\" };\n}\n\nfunction makeSession(tabId: number, goal: string, mode: AgentMode, planner: PlannerConfig): AgentSession {\n return {\n id: crypto.randomUUID(),\n tabId: tabId,\n goal,\n mode,\n planner,\n history: [],\n isRunning: true\n };\n}\n\nasync function tick(tabId: number) {\n const session = sessions.get(tabId);\n if (!session || !session.isRunning) {\n return;\n }\n\n const result = await chrome.tabs.sendMessage(tabId, {\n type: \"AGENT_TICK\",\n session\n });\n\n session.history.push(result.message);\n if (result.reflection?.memory !== undefined) {\n session.memory = result.reflection.memory;\n }\n session.lastError = result.status === \"error\" ? 
result.message : undefined;\n\n if (result.status === \"needs_approval\") {\n session.pendingAction = result.action;\n session.isRunning = false;\n return;\n }\n\n session.pendingAction = undefined;\n\n if ([\"done\", \"blocked\", \"error\"].includes(result.status)) {\n session.isRunning = false;\n return;\n }\n\n setTimeout(() => tick(tabId), 600);\n}\n\nchrome.runtime.onMessage.addListener((message, _sender, sendResponse) => {\n if (message.type === \"START_AGENT\") {\n const session = makeSession(message.tabId, message.goal, message.mode, normalizePlannerConfig(message.planner));\n sessions.set(message.tabId, session);\n tick(message.tabId).catch((error) => {\n const failed = sessions.get(message.tabId);\n if (failed) {\n failed.history.push(`Error: ${String(error)}`);\n failed.isRunning = false;\n }\n });\n sendResponse({ ok: true });\n return true;\n }\n\n if (message.type === \"APPROVE_ACTION\") {\n const session = sessions.get(message.tabId);\n if (!session) {\n sendResponse({ ok: false, error: \"No active session\" });\n return true;\n }\n\n session.isRunning = true;\n tick(message.tabId).catch((error) => {\n session.history.push(`Error: ${String(error)}`);\n session.isRunning = false;\n });\n sendResponse({ ok: true });\n return true;\n }\n\n if (message.type === \"STOP_AGENT\") {\n const session = sessions.get(message.tabId);\n if (session) {\n session.isRunning = false;\n }\n chrome.tabs.sendMessage(message.tabId, { type: \"AGENT_STOP\" }).catch(() => undefined);\n sendResponse({ ok: true });\n return true;\n }\n\n if (message.type === \"GET_STATUS\") {\n const lines = Array.from(sessions.values()).map(\n (session) =>\n `${session.isRunning ? \"RUNNING\" : \"IDLE\"} ${session.tabId}: ${session.goal.slice(0, 45)}${session.goal.length > 45 ? \"...\" : \"\"}`\n );\n\n sendResponse({ status: lines.length > 0 ? lines.join(\"\\n\") : \"Idle\" });\n return true;\n }\n\n return false;\n});\n"],
5
+ "mappings": ";AAEA,IAAM,WAAW,oBAAI,IAA0B;AAE/C,SAAS,uBAAuB,YAAoC;AAClE,MAAI,OAAO,eAAe,aAAa,eAAe,eAAe,eAAe,WAAW;AAC7F,WAAO,EAAE,MAAM,WAAW;AAAA,EAC5B;AAEA,MAAI,OAAO,eAAe,YAAY,eAAe,MAAM;AACzD,UAAM,SAAS;AACf,UAAM,OAAO,OAAO;AACpB,QAAI,SAAS,eAAe,SAAS,UAAU;AAC7C,aAAO;AAAA,QACL;AAAA,QACA,SAAS,OAAO,OAAO,YAAY,YAAY,OAAO,QAAQ,KAAK,IAAI,OAAO,UAAU;AAAA,QACxF,cAAc,OAAO,OAAO,iBAAiB,YAAY,OAAO,aAAa,KAAK,IAAI,OAAO,eAAe;AAAA,MAC9G;AAAA,IACF;AAAA,EACF;AAEA,SAAO,EAAE,MAAM,YAAY;AAC7B;AAEA,SAAS,YAAY,OAAe,MAAc,MAAiB,SAAsC;AACvG,SAAO;AAAA,IACL,IAAI,OAAO,WAAW;AAAA,IACtB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,SAAS,CAAC;AAAA,IACV,WAAW;AAAA,EACb;AACF;AAEA,eAAe,KAAK,OAAe;AACjC,QAAM,UAAU,SAAS,IAAI,KAAK;AAClC,MAAI,CAAC,WAAW,CAAC,QAAQ,WAAW;AAClC;AAAA,EACF;AAEA,QAAM,SAAS,MAAM,OAAO,KAAK,YAAY,OAAO;AAAA,IAClD,MAAM;AAAA,IACN;AAAA,EACF,CAAC;AAED,UAAQ,QAAQ,KAAK,OAAO,OAAO;AACnC,MAAI,OAAO,YAAY,WAAW,QAAW;AAC3C,YAAQ,SAAS,OAAO,WAAW;AAAA,EACrC;AACA,UAAQ,YAAY,OAAO,WAAW,UAAU,OAAO,UAAU;AAEjE,MAAI,OAAO,WAAW,kBAAkB;AACtC,YAAQ,gBAAgB,OAAO;AAC/B,YAAQ,YAAY;AACpB;AAAA,EACF;AAEA,UAAQ,gBAAgB;AAExB,MAAI,CAAC,QAAQ,WAAW,OAAO,EAAE,SAAS,OAAO,MAAM,GAAG;AACxD,YAAQ,YAAY;AACpB;AAAA,EACF;AAEA,aAAW,MAAM,KAAK,KAAK,GAAG,GAAG;AACnC;AAEA,OAAO,QAAQ,UAAU,YAAY,CAAC,SAAS,SAAS,iBAAiB;AACvE,MAAI,QAAQ,SAAS,eAAe;AAClC,UAAM,UAAU,YAAY,QAAQ,OAAO,QAAQ,MAAM,QAAQ,MAAM,uBAAuB,QAAQ,OAAO,CAAC;AAC9G,aAAS,IAAI,QAAQ,OAAO,OAAO;AACnC,SAAK,QAAQ,KAAK,EAAE,MAAM,CAAC,UAAU;AACnC,YAAM,SAAS,SAAS,IAAI,QAAQ,KAAK;AACzC,UAAI,QAAQ;AACV,eAAO,QAAQ,KAAK,UAAU,OAAO,KAAK,CAAC,EAAE;AAC7C,eAAO,YAAY;AAAA,MACrB;AAAA,IACF,CAAC;AACD,iBAAa,EAAE,IAAI,KAAK,CAAC;AACzB,WAAO;AAAA,EACT;AAEA,MAAI,QAAQ,SAAS,kBAAkB;AACrC,UAAM,UAAU,SAAS,IAAI,QAAQ,KAAK;AAC1C,QAAI,CAAC,SAAS;AACZ,mBAAa,EAAE,IAAI,OAAO,OAAO,oBAAoB,CAAC;AACtD,aAAO;AAAA,IACT;AAEA,YAAQ,YAAY;AACpB,SAAK,QAAQ,KAAK,EAAE,MAAM,CAAC,UAAU;AACnC,cAAQ,QAAQ,KAAK,UAAU,OAAO,KAAK,CAAC,EAAE;AAC9C,cAAQ,YAAY;AAAA,IACtB,CAAC;AACD,iBAAa,EAAE,IAAI,KAAK,CAAC;AACzB,WAAO;AAAA,EACT;AAEA,MAAI,QAAQ,SAAS,cAAc;AACjC,UAAM,UAAU,SAAS,IAAI,QAAQ,KAAK;AAC1C,QAAI,SA
AS;AACX,cAAQ,YAAY;AAAA,IACtB;AACA,WAAO,KAAK,YAAY,QAAQ,OAAO,EAAE,MAAM,aAAa,CAAC,EAAE,MAAM,MAAM,MAAS;AACpF,iBAAa,EAAE,IAAI,KAAK,CAAC;AACzB,WAAO;AAAA,EACT;AAEA,MAAI,QAAQ,SAAS,cAAc;AACjC,UAAM,QAAQ,MAAM,KAAK,SAAS,OAAO,CAAC,EAAE;AAAA,MAC1C,CAAC,YACC,GAAG,QAAQ,YAAY,YAAY,MAAM,IAAI,QAAQ,KAAK,KAAK,QAAQ,KAAK,MAAM,GAAG,EAAE,CAAC,GAAG,QAAQ,KAAK,SAAS,KAAK,QAAQ,EAAE;AAAA,IACpI;AAEA,iBAAa,EAAE,QAAQ,MAAM,SAAS,IAAI,MAAM,KAAK,IAAI,IAAI,OAAO,CAAC;AACrE,WAAO;AAAA,EACT;AAEA,SAAO;AACT,CAAC;",
6
6
  "names": []
7
7
  }
package/dist/content.js CHANGED
@@ -1,3 +1,86 @@
1
+ var __defProp = Object.defineProperty;
2
+ var __getOwnPropNames = Object.getOwnPropertyNames;
3
+ var __esm = (fn, res) => function __init() {
4
+ return fn && (res = (0, fn[__getOwnPropNames(fn)[0]])(fn = 0)), res;
5
+ };
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+
11
+ // src/shared/parse-action.ts
12
+ var parse_action_exports = {};
13
+ __export(parse_action_exports, {
14
+ parseAction: () => parseAction,
15
+ parsePlannerResult: () => parsePlannerResult
16
+ });
17
+ function parseAction(raw) {
18
+ const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)```/);
19
+ const candidate = fenceMatch ? fenceMatch[1].trim() : raw.trim();
20
+ const objectMatch = candidate.match(/\{[\s\S]*\}/);
21
+ if (!objectMatch) {
22
+ return { type: "done", reason: `No JSON object found in: ${raw.slice(0, 120)}` };
23
+ }
24
+ let parsed;
25
+ try {
26
+ parsed = JSON.parse(objectMatch[0]);
27
+ } catch {
28
+ return { type: "done", reason: `JSON parse error for: ${objectMatch[0].slice(0, 120)}` };
29
+ }
30
+ if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
31
+ return { type: "done", reason: "Parsed value is not an object" };
32
+ }
33
+ const obj = parsed;
34
+ if (typeof obj.type !== "string" || !VALID_TYPES.has(obj.type)) {
35
+ return { type: "done", reason: `Unknown or missing action type: ${String(obj.type)}` };
36
+ }
37
+ return obj;
38
+ }
39
+ function parsePlannerResult(raw) {
40
+ const fenceMatch = raw.match(/```(?:json)?\s*([\s\S]*?)```/);
41
+ const candidate = fenceMatch ? fenceMatch[1].trim() : raw.trim();
42
+ const objectMatch = candidate.match(/\{[\s\S]*\}/);
43
+ if (!objectMatch) {
44
+ return { action: { type: "done", reason: `No JSON found in: ${raw.slice(0, 120)}` } };
45
+ }
46
+ let parsed;
47
+ try {
48
+ parsed = JSON.parse(objectMatch[0]);
49
+ } catch {
50
+ return { action: { type: "done", reason: `JSON parse error: ${objectMatch[0].slice(0, 120)}` } };
51
+ }
52
+ if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
53
+ return { action: { type: "done", reason: "Parsed value is not an object" } };
54
+ }
55
+ const obj = parsed;
56
+ if (typeof obj.action === "object" && obj.action !== null) {
57
+ const action = parseAction(JSON.stringify(obj.action));
58
+ return {
59
+ action,
60
+ evaluation: typeof obj.evaluation === "string" ? obj.evaluation : void 0,
61
+ memory: typeof obj.memory === "string" ? obj.memory : void 0,
62
+ nextGoal: typeof obj.nextGoal === "string" ? obj.nextGoal : typeof obj.next_goal === "string" ? obj.next_goal : void 0
63
+ };
64
+ }
65
+ return { action: parseAction(objectMatch[0]) };
66
+ }
67
+ var VALID_TYPES;
68
+ var init_parse_action = __esm({
69
+ "src/shared/parse-action.ts"() {
70
+ "use strict";
71
+ VALID_TYPES = /* @__PURE__ */ new Set([
72
+ "click",
73
+ "type",
74
+ "navigate",
75
+ "extract",
76
+ "scroll",
77
+ "focus",
78
+ "wait",
79
+ "done"
80
+ ]);
81
+ }
82
+ });
83
+
1
84
  // src/shared/safety.ts
2
85
  var RISKY_KEYWORDS = /\b(delete|remove|pay|purchase|submit|confirm|checkout|transfer|withdraw|send)\b/i;
3
86
  function elementTextRisky(text) {
@@ -253,6 +336,18 @@ function toPlannerResult(raw) {
253
336
  }
254
337
  return { action: raw };
255
338
  }
339
+ async function parsePlannerText(raw) {
340
+ const parser = await Promise.resolve().then(() => (init_parse_action(), parse_action_exports));
341
+ return parser.parsePlannerResult(raw);
342
+ }
343
+ async function normalizeBridgeResponse(raw) {
344
+ if (typeof raw === "string") {
345
+ const parsed = await parsePlannerText(raw);
346
+ const parseFailed = parsed.action.type === "done" && /(No JSON|JSON parse error|Parsed value is not an object|Unknown or missing action type)/.test(parsed.action.reason);
347
+ return { result: parsed, parseFailed, rawText: raw };
348
+ }
349
+ return { result: toPlannerResult(raw), parseFailed: false };
350
+ }
256
351
  async function planNextAction(config, input) {
257
352
  if (config.kind === "heuristic") {
258
353
  return { action: heuristicPlan(input) };
@@ -266,8 +361,25 @@ async function planNextAction(config, input) {
266
361
  }
267
362
  };
268
363
  }
269
- const raw = await bridge.plan(input, config.modelId);
270
- return toPlannerResult(raw);
364
+ const plannerInput = { ...input, systemPrompt: config.systemPrompt };
365
+ const firstAttempt = await normalizeBridgeResponse(await bridge.plan(plannerInput, config.modelId));
366
+ if (!firstAttempt.parseFailed) {
367
+ return firstAttempt.result;
368
+ }
369
+ if (bridge.retryInvalidJson && firstAttempt.rawText) {
370
+ const retryAttempt = await normalizeBridgeResponse(
371
+ await bridge.retryInvalidJson(plannerInput, firstAttempt.rawText, config.modelId)
372
+ );
373
+ if (!retryAttempt.parseFailed) {
374
+ return retryAttempt.result;
375
+ }
376
+ }
377
+ return {
378
+ action: {
379
+ type: "done",
380
+ reason: "WebLLM output could not be parsed after retry."
381
+ }
382
+ };
271
383
  }
272
384
 
273
385
  // src/content/index.ts
@@ -293,8 +405,12 @@ async function runTick(session) {
293
405
  if (action.type === "done") {
294
406
  return { status: "done", action, message: action.reason, reflection };
295
407
  }
296
- const message = await executeAction(action);
297
- return { status: "executed", action, message, reflection };
408
+ try {
409
+ const message = await executeAction(action);
410
+ return { status: "executed", action, message, reflection };
411
+ } catch (error) {
412
+ return { status: "error", action, message: String(error), reflection };
413
+ }
298
414
  }
299
415
  async function executePendingAction(session) {
300
416
  if (!session.pendingAction) {