@akshayram1/omnibrowser-agent 0.2.8 → 0.2.28

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/popup.html CHANGED
@@ -7,9 +7,10 @@
7
7
  <style>
8
8
  body { font-family: system-ui, sans-serif; margin: 12px; width: 340px; }
9
9
  h1 { font-size: 16px; margin: 0 0 8px; }
10
- textarea, select, button { width: 100%; margin-top: 8px; }
10
+ textarea, select, input, button { width: 100%; margin-top: 8px; }
11
11
  textarea { min-height: 86px; }
12
12
  .row { display: grid; grid-template-columns: 1fr 1fr; gap: 8px; }
13
+ .hidden { display: none; }
13
14
  #status { margin-top: 10px; font-size: 12px; color: #444; white-space: pre-wrap; }
14
15
  </style>
15
16
  </head>
@@ -35,6 +36,11 @@
35
36
  </div>
36
37
  </div>
37
38
 
39
+ <div id="model-row" class="hidden">
40
+ <label for="modelId">WebLLM Model ID</label>
41
+ <input id="modelId" type="text" placeholder="Llama-3.2-1B-Instruct-q4f16_1-MLC" />
42
+ </div>
43
+
38
44
  <button id="start">Start</button>
39
45
  <button id="approve">Approve pending action</button>
40
46
  <button id="stop">Stop</button>
package/dist/popup.js CHANGED
@@ -2,6 +2,8 @@
2
2
  var goal = document.getElementById("goal");
3
3
  var mode = document.getElementById("mode");
4
4
  var planner = document.getElementById("planner");
5
+ var modelRow = document.getElementById("model-row");
6
+ var modelId = document.getElementById("modelId");
5
7
  var status = document.getElementById("status");
6
8
  var start = document.getElementById("start");
7
9
  var approve = document.getElementById("approve");
@@ -13,16 +15,32 @@ async function withActiveTab(fn) {
13
15
  }
14
16
  return fn(tab.id);
15
17
  }
18
+ function syncModelInputVisibility() {
19
+ const isWebLLM = planner.value === "webllm";
20
+ modelRow.classList.toggle("hidden", !isWebLLM);
21
+ }
22
+ planner.addEventListener("change", syncModelInputVisibility);
23
+ syncModelInputVisibility();
16
24
  start.addEventListener("click", async () => {
17
25
  try {
18
26
  status.textContent = "Starting...";
27
+ const plannerKind = planner.value;
28
+ const plannerConfig = {
29
+ kind: plannerKind
30
+ };
31
+ if (plannerKind === "webllm") {
32
+ const trimmedModelId = modelId.value.trim();
33
+ if (trimmedModelId) {
34
+ plannerConfig.modelId = trimmedModelId;
35
+ }
36
+ }
19
37
  await withActiveTab(
20
38
  (tabId) => chrome.runtime.sendMessage({
21
39
  type: "START_AGENT",
22
40
  tabId,
23
41
  goal: goal.value.trim(),
24
42
  mode: mode.value,
25
- planner: planner.value
43
+ planner: plannerConfig
26
44
  })
27
45
  );
28
46
  status.textContent = "Agent started";
package/dist/popup.js.map CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "version": 3,
3
3
  "sources": ["../src/popup/index.ts"],
4
- "sourcesContent": ["import type { AgentMode, PlannerKind } from \"../shared/contracts\";\n\nconst goal = document.getElementById(\"goal\") as HTMLTextAreaElement;\nconst mode = document.getElementById(\"mode\") as HTMLSelectElement;\nconst planner = document.getElementById(\"planner\") as HTMLSelectElement;\nconst status = document.getElementById(\"status\") as HTMLDivElement;\n\nconst start = document.getElementById(\"start\") as HTMLButtonElement;\nconst approve = document.getElementById(\"approve\") as HTMLButtonElement;\nconst stop = document.getElementById(\"stop\") as HTMLButtonElement;\n\nasync function withActiveTab<T>(fn: (tabId: number) => Promise<T>) {\n const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });\n if (!tab?.id) {\n throw new Error(\"No active tab found\");\n }\n return fn(tab.id);\n}\n\nstart.addEventListener(\"click\", async () => {\n try {\n status.textContent = \"Starting...\";\n await withActiveTab((tabId) =>\n chrome.runtime.sendMessage({\n type: \"START_AGENT\",\n tabId,\n goal: goal.value.trim(),\n mode: mode.value as AgentMode,\n planner: planner.value as PlannerKind\n })\n );\n status.textContent = \"Agent started\";\n } catch (error) {\n status.textContent = `Error: ${String(error)}`;\n }\n});\n\napprove.addEventListener(\"click\", async () => {\n await withActiveTab((tabId) => chrome.runtime.sendMessage({ type: \"APPROVE_ACTION\", tabId }));\n status.textContent = \"Approved pending action\";\n});\n\nstop.addEventListener(\"click\", async () => {\n await withActiveTab((tabId) => chrome.runtime.sendMessage({ type: \"STOP_AGENT\", tabId }));\n status.textContent = \"Stopped\";\n});\n\nchrome.runtime.sendMessage({ type: \"GET_STATUS\" }, (resp) => {\n if (resp?.status) {\n status.textContent = resp.status;\n }\n});\n"],
5
- "mappings": ";AAEA,IAAM,OAAO,SAAS,eAAe,MAAM;AAC3C,IAAM,OAAO,SAAS,eAAe,MAAM;AAC3C,IAAM,UAAU,SAAS,eAAe,SAAS;AACjD,IAAM,SAAS,SAAS,eAAe,QAAQ;AAE/C,IAAM,QAAQ,SAAS,eAAe,OAAO;AAC7C,IAAM,UAAU,SAAS,eAAe,SAAS;AACjD,IAAM,OAAO,SAAS,eAAe,MAAM;AAE3C,eAAe,cAAiB,IAAmC;AACjE,QAAM,CAAC,GAAG,IAAI,MAAM,OAAO,KAAK,MAAM,EAAE,QAAQ,MAAM,eAAe,KAAK,CAAC;AAC3E,MAAI,CAAC,KAAK,IAAI;AACZ,UAAM,IAAI,MAAM,qBAAqB;AAAA,EACvC;AACA,SAAO,GAAG,IAAI,EAAE;AAClB;AAEA,MAAM,iBAAiB,SAAS,YAAY;AAC1C,MAAI;AACF,WAAO,cAAc;AACrB,UAAM;AAAA,MAAc,CAAC,UACnB,OAAO,QAAQ,YAAY;AAAA,QACzB,MAAM;AAAA,QACN;AAAA,QACA,MAAM,KAAK,MAAM,KAAK;AAAA,QACtB,MAAM,KAAK;AAAA,QACX,SAAS,QAAQ;AAAA,MACnB,CAAC;AAAA,IACH;AACA,WAAO,cAAc;AAAA,EACvB,SAAS,OAAO;AACd,WAAO,cAAc,UAAU,OAAO,KAAK,CAAC;AAAA,EAC9C;AACF,CAAC;AAED,QAAQ,iBAAiB,SAAS,YAAY;AAC5C,QAAM,cAAc,CAAC,UAAU,OAAO,QAAQ,YAAY,EAAE,MAAM,kBAAkB,MAAM,CAAC,CAAC;AAC5F,SAAO,cAAc;AACvB,CAAC;AAED,KAAK,iBAAiB,SAAS,YAAY;AACzC,QAAM,cAAc,CAAC,UAAU,OAAO,QAAQ,YAAY,EAAE,MAAM,cAAc,MAAM,CAAC,CAAC;AACxF,SAAO,cAAc;AACvB,CAAC;AAED,OAAO,QAAQ,YAAY,EAAE,MAAM,aAAa,GAAG,CAAC,SAAS;AAC3D,MAAI,MAAM,QAAQ;AAChB,WAAO,cAAc,KAAK;AAAA,EAC5B;AACF,CAAC;",
4
+ "sourcesContent": ["import type { AgentMode, PlannerConfig, PlannerKind } from \"../shared/contracts\";\n\nconst goal = document.getElementById(\"goal\") as HTMLTextAreaElement;\nconst mode = document.getElementById(\"mode\") as HTMLSelectElement;\nconst planner = document.getElementById(\"planner\") as HTMLSelectElement;\nconst modelRow = document.getElementById(\"model-row\") as HTMLDivElement;\nconst modelId = document.getElementById(\"modelId\") as HTMLInputElement;\nconst status = document.getElementById(\"status\") as HTMLDivElement;\n\nconst start = document.getElementById(\"start\") as HTMLButtonElement;\nconst approve = document.getElementById(\"approve\") as HTMLButtonElement;\nconst stop = document.getElementById(\"stop\") as HTMLButtonElement;\n\nasync function withActiveTab<T>(fn: (tabId: number) => Promise<T>) {\n const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });\n if (!tab?.id) {\n throw new Error(\"No active tab found\");\n }\n return fn(tab.id);\n}\n\nfunction syncModelInputVisibility() {\n const isWebLLM = planner.value === \"webllm\";\n modelRow.classList.toggle(\"hidden\", !isWebLLM);\n}\n\nplanner.addEventListener(\"change\", syncModelInputVisibility);\nsyncModelInputVisibility();\n\nstart.addEventListener(\"click\", async () => {\n try {\n status.textContent = \"Starting...\";\n const plannerKind = planner.value as PlannerKind;\n const plannerConfig: PlannerConfig = {\n kind: plannerKind\n };\n\n if (plannerKind === \"webllm\") {\n const trimmedModelId = modelId.value.trim();\n if (trimmedModelId) {\n plannerConfig.modelId = trimmedModelId;\n }\n }\n\n await withActiveTab((tabId) =>\n chrome.runtime.sendMessage({\n type: \"START_AGENT\",\n tabId,\n goal: goal.value.trim(),\n mode: mode.value as AgentMode,\n planner: plannerConfig\n })\n );\n status.textContent = \"Agent started\";\n } catch (error) {\n status.textContent = `Error: ${String(error)}`;\n }\n});\n\napprove.addEventListener(\"click\", async () => {\n await withActiveTab((tabId) => chrome.runtime.sendMessage({ type: \"APPROVE_ACTION\", tabId }));\n status.textContent = \"Approved pending action\";\n});\n\nstop.addEventListener(\"click\", async () => {\n await withActiveTab((tabId) => chrome.runtime.sendMessage({ type: \"STOP_AGENT\", tabId }));\n status.textContent = \"Stopped\";\n});\n\nchrome.runtime.sendMessage({ type: \"GET_STATUS\" }, (resp) => {\n if (resp?.status) {\n status.textContent = resp.status;\n }\n});\n"],
5
+ "mappings": ";AAEA,IAAM,OAAO,SAAS,eAAe,MAAM;AAC3C,IAAM,OAAO,SAAS,eAAe,MAAM;AAC3C,IAAM,UAAU,SAAS,eAAe,SAAS;AACjD,IAAM,WAAW,SAAS,eAAe,WAAW;AACpD,IAAM,UAAU,SAAS,eAAe,SAAS;AACjD,IAAM,SAAS,SAAS,eAAe,QAAQ;AAE/C,IAAM,QAAQ,SAAS,eAAe,OAAO;AAC7C,IAAM,UAAU,SAAS,eAAe,SAAS;AACjD,IAAM,OAAO,SAAS,eAAe,MAAM;AAE3C,eAAe,cAAiB,IAAmC;AACjE,QAAM,CAAC,GAAG,IAAI,MAAM,OAAO,KAAK,MAAM,EAAE,QAAQ,MAAM,eAAe,KAAK,CAAC;AAC3E,MAAI,CAAC,KAAK,IAAI;AACZ,UAAM,IAAI,MAAM,qBAAqB;AAAA,EACvC;AACA,SAAO,GAAG,IAAI,EAAE;AAClB;AAEA,SAAS,2BAA2B;AAClC,QAAM,WAAW,QAAQ,UAAU;AACnC,WAAS,UAAU,OAAO,UAAU,CAAC,QAAQ;AAC/C;AAEA,QAAQ,iBAAiB,UAAU,wBAAwB;AAC3D,yBAAyB;AAEzB,MAAM,iBAAiB,SAAS,YAAY;AAC1C,MAAI;AACF,WAAO,cAAc;AACrB,UAAM,cAAc,QAAQ;AAC5B,UAAM,gBAA+B;AAAA,MACnC,MAAM;AAAA,IACR;AAEA,QAAI,gBAAgB,UAAU;AAC5B,YAAM,iBAAiB,QAAQ,MAAM,KAAK;AAC1C,UAAI,gBAAgB;AAClB,sBAAc,UAAU;AAAA,MAC1B;AAAA,IACF;AAEA,UAAM;AAAA,MAAc,CAAC,UACnB,OAAO,QAAQ,YAAY;AAAA,QACzB,MAAM;AAAA,QACN;AAAA,QACA,MAAM,KAAK,MAAM,KAAK;AAAA,QACtB,MAAM,KAAK;AAAA,QACX,SAAS;AAAA,MACX,CAAC;AAAA,IACH;AACA,WAAO,cAAc;AAAA,EACvB,SAAS,OAAO;AACd,WAAO,cAAc,UAAU,OAAO,KAAK,CAAC;AAAA,EAC9C;AACF,CAAC;AAED,QAAQ,iBAAiB,SAAS,YAAY;AAC5C,QAAM,cAAc,CAAC,UAAU,OAAO,QAAQ,YAAY,EAAE,MAAM,kBAAkB,MAAM,CAAC,CAAC;AAC5F,SAAO,cAAc;AACvB,CAAC;AAED,KAAK,iBAAiB,SAAS,YAAY;AACzC,QAAM,cAAc,CAAC,UAAU,OAAO,QAAQ,YAAY,EAAE,MAAM,cAAc,MAAM,CAAC,CAAC;AACxF,SAAO,cAAc;AACvB,CAAC;AAED,OAAO,QAAQ,YAAY,EAAE,MAAM,aAAa,GAAG,CAAC,SAAS;AAC3D,MAAI,MAAM,QAAQ;AAChB,WAAO,cAAc,KAAK;AAAA,EAC5B;AACF,CAAC;",
6
6
  "names": []
7
7
  }
@@ -0,0 +1,3 @@
1
+ import type { PlannerInput } from "../shared/contracts";
2
+ export declare function buildSystemPrompt(customPrompt?: string): string;
3
+ export declare function buildUserMessage(input: PlannerInput): string;
@@ -0,0 +1,33 @@
1
+ import type { PlannerInput, PlannerResult } from "../shared/contracts";
2
+ export declare const INVALID_JSON_RETRY_MESSAGE = "Invalid JSON. Reply with only a valid JSON object.";
3
+ type ChatRole = "system" | "user" | "assistant";
4
+ type ChatMessage = {
5
+ role: ChatRole;
6
+ content: string;
7
+ };
8
+ type CompletionRequest = {
9
+ messages: ChatMessage[];
10
+ temperature?: number;
11
+ max_tokens?: number;
12
+ model?: string;
13
+ };
14
+ type CompletionResponse = {
15
+ choices?: Array<{
16
+ message?: {
17
+ content?: unknown;
18
+ };
19
+ }>;
20
+ };
21
+ export type WebLLMEngineLike = {
22
+ chat: {
23
+ completions: {
24
+ create(request: CompletionRequest): Promise<CompletionResponse>;
25
+ };
26
+ };
27
+ };
28
+ export type BrowserAgentWebLLMBridge = {
29
+ plan(input: PlannerInput, modelId?: string): Promise<PlannerResult>;
30
+ retryInvalidJson(input: PlannerInput, badOutput: string, modelId?: string): Promise<PlannerResult>;
31
+ };
32
+ export declare function createWebLLMBridge(engine: WebLLMEngineLike): BrowserAgentWebLLMBridge;
33
+ export {};
@@ -20,5 +20,7 @@ export declare class BrowserAgent {
20
20
  private delay;
21
21
  }
22
22
  export declare function createBrowserAgent(config: LibraryAgentConfig, events?: LibraryAgentEvents): BrowserAgent;
23
+ export { createWebLLMBridge } from "../core/webllm-bridge";
23
24
  export { parseAction, parsePlannerResult } from "../shared/parse-action";
25
+ export type { BrowserAgentWebLLMBridge, WebLLMEngineLike } from "../core/webllm-bridge";
24
26
  export type { AgentAction, AgentMode, AgentSession, ContentResult, LibraryAgentConfig, LibraryAgentEvents, PlannerConfig, PlannerInput, PlannerKind, PlannerResult, RiskLevel } from "../shared/contracts";
@@ -10,7 +10,8 @@ export declare function parseAction(raw: string): AgentAction;
10
10
  * Parse a full PlannerResult from raw LLM output.
11
11
  *
12
12
  * Accepts the reflection+action format:
13
- * { "evaluation": "...", "memory": "...", "next_goal": "...", "action": { ... } }
13
+ * { "evaluation": "...", "memory": "...", "nextGoal": "...", "action": { ... } }
14
+ * Also supports legacy `next_goal` key for backward compatibility.
14
15
  *
15
16
  * Also accepts a bare AgentAction for backward compatibility with simple bridges.
16
17
  */
@@ -61,4 +61,4 @@ window.__browserAgentWebLLM = {
61
61
  - No persistent long-term memory yet
62
62
  - No task DSL/skills registry yet
63
63
  - Risk scoring is simple keyword heuristic
64
- - No robust selector healing yet
64
+ - Selector healing is basic (attribute fallback + single-element shortcut)
package/docs/EMBEDDING.md CHANGED
@@ -53,22 +53,11 @@ To use planner mode `webllm`, load the WebLLM engine and wire the bridge before
53
53
 
54
54
  ```ts
55
55
  import * as webllm from "@mlc-ai/web-llm";
56
+ import { createBrowserAgent, createWebLLMBridge } from "@akshayram1/omnibrowser-agent";
56
57
 
57
58
  const engine = await webllm.CreateMLCEngine("Llama-3.2-1B-Instruct-q4f16_1-MLC");
58
59
 
59
- window.__browserAgentWebLLM = {
60
- async plan(input, modelId) {
61
- const resp = await engine.chat.completions.create({
62
- messages: [
63
- { role: "system", content: "Output only a JSON AgentAction object." },
64
- { role: "user", content: `Goal: ${input.goal}\nHistory: ${input.history.join(", ")}` }
65
- ],
66
- temperature: 0,
67
- max_tokens: 100
68
- });
69
- return JSON.parse(resp.choices[0].message.content);
70
- }
71
- };
60
+ window.__browserAgentWebLLM = createWebLLMBridge(engine);
72
61
 
73
62
  const agent = createBrowserAgent({
74
63
  goal: "Fill the contact form",
@@ -82,4 +71,4 @@ await agent.start();
82
71
 
83
72
  - For production, mount this inside an authenticated app shell and add your own permission checks.
84
73
  - `human-approved` mode is recommended for CRM/finance/admin actions.
85
- - The WebLLM bridge is not bundled — bring your own engine instance and wire it to `window.__browserAgentWebLLM`.
74
+ - Bring your own WebLLM engine instance, then wire `createWebLLMBridge(engine)` to `window.__browserAgentWebLLM`.
package/docs/ROADMAP.md CHANGED
@@ -16,19 +16,14 @@
16
16
 
17
17
  ## v0.3
18
18
 
19
- - Site profile + policy engine (allowlist, blocked domains)
20
- - Selector healing and fallback strategy
21
- - Session memory and action replay log
22
- - Drupal CRM starter skills
23
-
24
- ## v0.3
25
-
26
- - Long-term encrypted memory in IndexedDB
27
- - Goal decomposition planner (multi-step task graphs)
28
- - Multi-tab workflows
19
+ - Expanded WebLLM model catalog (new 7B/8B options + compatibility matrix)
20
+ - Improved model loading UX (recommended presets by speed/quality and device memory)
21
+ - Enhanced default system prompts for safer, clearer multi-step planning
22
+ - Prompt presets for common workflows (docs navigation, CRM form fill, task automation)
29
23
 
30
24
  ## v1.0
31
25
 
32
- - Stable plugin API for site skills
33
- - Validation/eval harness with benchmark tasks
34
- - Cross-browser packaging (Chromium + Firefox)
26
+ - Advanced prompt orchestration (goal-aware system prompt routing and contextual guardrails)
27
+ - Functionality expansion: richer action toolkit and stronger extraction/navigation reliability
28
+ - Adaptive planner behaviour (model-aware retries, fallback strategies, and recovery flows)
29
+ - Evaluation suite for prompt and model quality across benchmark browser tasks
package/docs/arch.md CHANGED
@@ -90,7 +90,7 @@ These three modules are **shared** between the extension content script and the
90
90
  | `executor.ts` | Performs DOM actions and returns a result string |
91
91
 
92
92
  **`observer.ts` — `collectSnapshot()`**
93
- Queries all interactive elements (`a`, `button`, `input`, `textarea`, `select`, `[role=button]`, `[contenteditable]`), filters out invisible ones (hidden, `display:none`, zero dimensions), and prioritises in-viewport elements. Resolves accessible labels via `aria-labelledby`, `aria-label`, `for/id`, and wrapping `<label>`. Caps at 60 candidates. Returns `url`, `title`, `textPreview`, and `candidates[]`.
93
+ Queries all interactive elements (`a`, `button`, `input`, `textarea`, `select`, `[role=button]`, `[contenteditable]`), filters out invisible ones (hidden, `display:none`, zero dimensions), and prioritises in-viewport elements. Resolves accessible labels via `aria-labelledby`, `aria-label`, `for/id`, and wrapping `<label>`. Generates stable CSS selectors preferring `name`, `placeholder`, and `aria-label` attributes over fragile `:nth-of-type()` indices. Caps at 60 candidates. Returns `url`, `title`, `textPreview`, and `candidates[]`.
94
94
 
95
95
  **`planner.ts` — `planNextAction()`**
96
96
  Two modes:
@@ -98,7 +98,7 @@ Two modes:
98
98
  - *WebLLM* — delegates to `window.__browserAgentWebLLM.plan()`. The bridge is external — you wire it in. Accepts both legacy `AgentAction` returns and the new `PlannerResult` (with `evaluation`, `memory`, `nextGoal` reflection fields).
99
99
 
100
100
  **`executor.ts` — `executeAction()`**
101
- Performs the action. Uses `InputEvent` with `bubbles: true` so React/Vue controlled inputs receive proper framework events. Verifies: element exists, is not disabled (for clicks), value updated (for type), extracted text is non-empty. Throws on failure so the retry loop can feed `lastError` back to the planner.
101
+ Performs the action. Uses `InputEvent` with `bubbles: true` so React/Vue controlled inputs receive proper framework events. Verifies: element exists, is not disabled (for clicks), value updated (for type), extracted text is non-empty. Includes selector fallback: when a selector fails, tries to recover via tag+attribute matching or single-element shortcut before throwing. Throws on failure so the retry loop can feed `lastError` back to the planner.
102
102
 
103
103
  ---
104
104