hanzi-browse 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,45 +1,51 @@
1
- # Hanzi Browse — MCP Server
1
+ # Hanzi Browse
2
2
 
3
- The MCP server exposes browser tools to MCP clients and forwards browser work to
4
- the Chrome extension over the local WebSocket relay.
3
+ Give your AI agent a real browser with your existing logins, cookies, and sessions.
5
4
 
6
- ## Setup
5
+ **Two ways to use it:**
6
+ - **Use locally** — MCP server for Claude Code, Cursor, Codex, and other AI coding agents
7
+ - **Build with it** — REST API + TypeScript SDK for embedding browser automation in your product
8
+
9
+ ## Quick Start (MCP)
7
10
 
8
11
  ```bash
9
- cd mcp-server
10
- npm install
11
- npm run build
12
+ npx hanzi-browse setup
12
13
  ```
13
14
 
14
- Add to your MCP config (e.g., `~/.claude/claude_desktop_config.json`):
15
+ This installs the Chrome extension and configures your AI agent. One command, done.
15
16
 
16
- ```json
17
- {
18
- "mcpServers": {
19
- "browser": {
20
- "command": "node",
21
- "args": ["/path/to/hanzi-browse/mcp-server/dist/index.js"]
22
- }
23
- }
24
- }
17
+ **Prerequisites:** Chrome must be open with the [Hanzi extension](https://chromewebstore.google.com/detail/hanzi-browse/iklpkemlmbhemkiojndpbhoakgikpmcd) installed.
18
+
19
+ ## Quick Start (API)
20
+
21
+ ```bash
22
+ npm install @hanzi-browse/sdk
25
23
  ```
26
24
 
27
- **Prerequisites:** The Chrome extension must be installed and running. See the [main README](../README.md) for full setup.
25
+ ```typescript
26
+ import { HanziClient } from '@hanzi-browse/sdk';
28
27
 
29
- ## How It Works
28
+ const client = new HanziClient({ apiKey: 'hic_live_...' });
30
29
 
31
- ```text
32
- MCP client
33
- -> mcp-server (stdio)
34
- -> relay (WebSocket)
35
- -> Chrome extension
36
- -> browser agent
30
+ // 1. Pair a browser — give the URL to your user
31
+ const { pairingToken } = await client.createPairingToken();
32
+ // User visits: https://api.hanzilla.co/pair/{pairingToken}
33
+
34
+ // 2. Find their connected session
35
+ const sessions = await client.listSessions();
36
+ const browser = sessions.find(s => s.status === 'connected');
37
+
38
+ // 3. Run a task (polls until complete)
39
+ const result = await client.runTask({
40
+ browserSessionId: browser.id,
41
+ task: 'Go to example.com and read the page title',
42
+ });
43
+ console.log(result.answer);
37
44
  ```
38
45
 
39
- The extension is the browser executor. The MCP server should only manage MCP
40
- tool calls, local session bookkeeping, and blocking waits for completion.
46
+ Full API docs: [browse.hanzilla.co/docs.html](https://browse.hanzilla.co/docs.html)
41
47
 
42
- ## Tools
48
+ ## MCP Tools
43
49
 
44
50
  ### `browser_start`
45
51
 
@@ -55,7 +61,6 @@ browser_start(
55
61
  → {
56
62
  "session_id": "abc123",
57
63
  "status": "complete",
58
- "task": "Search for flights to Tokyo...",
59
64
  "answer": "Found 3 flights: JAL $850, ANA $920, United $780",
60
65
  "total_steps": 8,
61
66
  "recent_steps": ["Opened Google Flights", "Set destination to Tokyo", ...]
@@ -64,7 +69,7 @@ browser_start(
64
69
 
65
70
  ### `browser_message`
66
71
 
67
- Send follow-up instructions to an existing session. Also blocks until the agent finishes.
72
+ Send follow-up instructions to an existing session.
68
73
 
69
74
  ```
70
75
  browser_message(session_id: "abc123", message: "Book the cheapest one")
@@ -75,7 +80,7 @@ browser_message(session_id: "abc123", message: "Book the cheapest one")
75
80
  Check known sessions and their latest status.
76
81
 
77
82
  ```
78
- browser_status() // all active sessions
83
+ browser_status() // all active sessions
79
84
  browser_status(session_id: "abc123") // specific session
80
85
  ```
81
86
 
@@ -85,7 +90,7 @@ Stop a task.
85
90
 
86
91
  ```
87
92
  browser_stop(session_id: "abc123")
88
- browser_stop(session_id: "abc123", remove: true) // also delete session
93
+ browser_stop(session_id: "abc123", remove: true) // also close window
89
94
  ```
90
95
 
91
96
  ### `browser_screenshot`
@@ -98,84 +103,65 @@ browser_screenshot(session_id: "abc123")
98
103
 
99
104
  ## Examples
100
105
 
101
- **Research:**
102
- ```
103
- browser_start("Find the top 3 competitors for Acme Corp and summarize their pricing")
104
- ```
105
-
106
106
  **Logged-in workflows:**
107
107
  ```
108
- browser_start("Go to Jira, find my open tickets, and summarize what needs attention this week")
108
+ browser_start("Go to Jira, find my open tickets, and summarize what needs attention")
109
109
  ```
110
110
 
111
111
  **Multi-turn:**
112
112
  ```
113
113
  s = browser_start("Go to LinkedIn and find AI Engineer jobs in Montreal")
114
- → { session_id: "x1", answer: "Found: Applied AI Engineer at Cohere" }
115
-
116
- browser_message("x1", "Click into that job and tell me the requirements")
117
- → { answer: "Requirements: 3+ years Python, ML experience..." }
118
-
119
- browser_message("x1", "Apply to this job using my profile")
120
- → { answer: "Application submitted successfully" }
114
+ browser_message(s.session_id, "Click into the Cohere job and tell me the requirements")
115
+ browser_message(s.session_id, "Apply to this job using my profile")
121
116
  ```
122
117
 
123
118
  **Parallel execution:**
124
119
  ```
125
120
  browser_start("Check flight prices to Tokyo")
126
121
  browser_start("Check hotel prices in Shibuya")
127
- browser_start("Look up train pass costs")
128
- // All three run simultaneously
122
+ // Both run simultaneously in separate windows
129
123
  ```
130
124
 
131
125
  ## Configuration
132
126
 
133
127
  | Environment Variable | Default | Description |
134
128
  |---|---|---|
135
- | `HANZI_IN_CHROME_MAX_SESSIONS` | `5` | Max concurrent browser tasks |
129
+ | `HANZI_BROWSE_MAX_SESSIONS` | `5` | Max concurrent browser tasks |
130
+ | `HANZI_BROWSE_TIMEOUT_MS` | `300000` | Task timeout (ms) |
136
131
  | `WS_RELAY_PORT` | `7862` | WebSocket relay port |
132
+ | `POSTHOG_API_KEY` | unset | Enables PostHog analytics for local CLI telemetry, the dashboard build, the managed backend, and example apps |
133
+ | `POSTHOG_HOST` | `https://us.i.posthog.com` | Override the PostHog host for all server-side capture calls and dashboard initialization |
137
134
 
138
- ## Architecture
135
+ ## Skills
139
136
 
140
- ```
141
- AI Tool (Claude Code, Cursor, etc.)
142
- ↓ MCP Protocol (stdio)
143
- MCP Server
144
- ↓ WebSocket
145
- Relay Server
146
- ↓ WebSocket
147
- Chrome Extension
148
- ↓ Extension agent loop
149
- Target Website
150
- ```
151
-
152
- The relay server starts automatically when the MCP server connects. It routes
153
- messages between the MCP server and the Chrome extension and briefly queues
154
- messages while the extension service worker is asleep.
155
-
156
- > **Principle**: Hanzi is for real browser work in your signed-in Chrome.
157
- > Agents should prefer code, logs, APIs, and existing tools first. Use Hanzi when the job needs a real browser session.
158
-
159
- ## Prompts
160
-
161
- The server exposes MCP prompts that clients auto-discover as slash commands:
137
+ The server exposes MCP prompts that clients auto-discover:
162
138
 
163
139
  | Prompt | Description |
164
140
  |--------|-------------|
165
- | `linkedin-prospector` | Goal-driven LinkedIn outreach — networking, sales, partnerships, or hiring |
166
- | `e2e-tester` | Test your app in a real browser — reports bugs with screenshots and code references |
167
- | `social-poster` | Post across LinkedIn, Twitter, Reddit, HN — drafts per-platform, posts from your browser |
168
-
169
- In Claude Code, use the built-in `linkedin-prospector` prompt from the MCP prompt list.
170
-
171
- ## Skills CLI
141
+ | `linkedin-prospector` | Goal-driven LinkedIn outreach |
142
+ | `e2e-tester` | Test your app in a real browser with screenshots |
143
+ | `social-poster` | Post across LinkedIn, Twitter, Reddit from your browser |
144
+ | `x-marketer` | Find X/Twitter conversations and draft voice-matched replies |
172
145
 
173
146
  ```bash
174
147
  hanzi-browser skills # list available skills
175
148
  hanzi-browser skills install linkedin-prospector # install SKILL.md to your project
176
149
  ```
177
150
 
178
- Skills are portable SKILL.md files for agents that don't support MCP prompts (Cline, Codex). Each skill follows the same principle: use existing tools first, Hanzi only for real browser steps.
151
+ ## Architecture
152
+
153
+ ```
154
+ AI Agent (Claude Code, Cursor, etc.)
155
+ ↓ MCP Protocol (stdio)
156
+ MCP Server (this package)
157
+ ↓ WebSocket
158
+ Chrome Extension
159
+ ↓ Chrome DevTools Protocol
160
+ User's Real Browser
161
+ ```
162
+
163
+ > **Principle**: Hanzi is for real browser work in your signed-in Chrome.
164
+ > Agents should prefer code, logs, APIs, and existing tools first. Use Hanzi when the job needs a real browser session.
179
165
 
180
166
  ## License
181
167
 
@@ -1,10 +1,11 @@
1
1
  /**
2
- * Domain-specific knowledge for the server-side agent loop.
3
- * Matches the extension's domain-skills.js but only includes domains
4
- * relevant to managed/API tasks.
2
+ * Domain-specific knowledge for the agent loop.
3
+ * Single source of truth shared between server (managed API, MCP)
4
+ * and extension (via import at build time).
5
5
  */
6
6
  interface DomainEntry {
7
7
  domain: string;
8
+ antiBot?: boolean;
8
9
  skill: string;
9
10
  }
10
11
  /**
@@ -12,4 +13,8 @@ interface DomainEntry {
12
13
  * Returns the first matching entry, or null.
13
14
  */
14
15
  export declare function getDomainSkill(url: string): DomainEntry | null;
16
+ /**
17
+ * Get all domain skills. Used by the extension to import the full list.
18
+ */
19
+ export declare function getAllDomainSkills(): DomainEntry[];
15
20
  export {};
@@ -1,51 +1,15 @@
1
1
  /**
2
- * Domain-specific knowledge for the server-side agent loop.
3
- * Matches the extension's domain-skills.js but only includes domains
4
- * relevant to managed/API tasks.
2
+ * Domain-specific knowledge for the agent loop.
3
+ * Single source of truth shared between server (managed API, MCP)
4
+ * and extension (via import at build time).
5
5
  */
6
- const DOMAIN_KNOWLEDGE = [
7
- {
8
- domain: "x.com",
9
- skill: `X/Twitter verified patterns (updated 2026-03-30)
10
-
11
- ## Reading pages (CRITICAL)
12
- - X loads content asynchronously — page looks empty for 3-5 seconds after navigation.
13
- - read_page often returns ONLY "To view keyboard shortcuts" — tweets haven't loaded yet.
14
- - DO NOT re-navigate to the same URL. That resets loading and makes it worse.
15
- - Instead: wait 5 seconds, then use get_page_text — it reads visible text and is more reliable.
16
- - If get_page_text returns nothing, scroll down once and try again.
17
-
18
- ## Search
19
- - URL: x.com/search?q={encoded_query}&src=typed_query&f=live
20
- - After navigating, wait 5 seconds, then get_page_text (NOT read_page).
21
- - Scroll down once to load more tweets, then get_page_text again.
22
- - Tweet URLs in page text follow pattern: /status/{id}
23
-
24
- ## Text input (CRITICAL — Draft.js)
25
- - form_input DOES NOT WORK — Draft.js ignores programmatic input.
26
- - computer type action GARBLES TEXT.
27
- - ONLY RELIABLE METHOD — use javascript_tool:
28
- document.querySelector('[data-testid="tweetTextarea_0"]').focus();
29
- document.execCommand('insertText', false, 'your reply text here');
30
- - Always verify text appeared by reading after insertion.
31
-
32
- ## Replying to a tweet
33
- 1. Navigate to tweet URL (x.com/{handle}/status/{id})
34
- 2. Wait 3 seconds, read the page
35
- 3. Click the reply/comment icon (speech bubble) in the action bar
36
- 4. Use javascript_tool to insert text (see above)
37
- 5. Verify text appeared, then click blue "Reply" button
38
- 6. Wait 2 seconds to confirm reply posted
39
-
40
- ## Known traps
41
- - DO NOT scroll looking for "Post your reply" — reply box appears after clicking comment icon
42
- - x.com/compose/post may open — that's fine, type and click Reply there
43
- - "Leave site?" dialog — ALWAYS click Cancel, finish posting first
44
- - Reply button is disabled until text is entered — verify first
45
- - Space replies 15+ seconds apart (rate limiting)
46
- - NEVER navigate to the same URL you're already on`,
47
- },
48
- ];
6
+ import { readFileSync } from "fs";
7
+ import { fileURLToPath } from "url";
8
+ import { dirname, join } from "path";
9
+ // Load from shared JSON file
10
+ const __filename = fileURLToPath(import.meta.url);
11
+ const __dirname = dirname(__filename);
12
+ const DOMAIN_SKILLS = JSON.parse(readFileSync(join(__dirname, "domain-skills.json"), "utf-8"));
49
13
  /**
50
14
  * Look up domain knowledge for a URL.
51
15
  * Returns the first matching entry, or null.
@@ -53,11 +17,17 @@ const DOMAIN_KNOWLEDGE = [
53
17
  export function getDomainSkill(url) {
54
18
  try {
55
19
  const hostname = new URL(url).hostname.toLowerCase();
56
- return DOMAIN_KNOWLEDGE.find((d) => hostname === d.domain || hostname.endsWith("." + d.domain)) || null;
20
+ return DOMAIN_SKILLS.find((d) => hostname === d.domain || hostname.endsWith("." + d.domain)) || null;
57
21
  }
58
22
  catch {
59
23
  // new URL() threw, so the input is not an absolute URL — fall back to a substring match of each domain against the lower-cased raw input
60
24
  const lower = url.toLowerCase();
61
- return DOMAIN_KNOWLEDGE.find((d) => lower.includes(d.domain)) || null;
25
+ return DOMAIN_SKILLS.find((d) => lower.includes(d.domain)) || null;
62
26
  }
63
27
  }
28
+ /**
29
+ * Get all domain skills. Used by the extension to import the full list.
30
+ */
31
+ export function getAllDomainSkills() {
32
+ return DOMAIN_SKILLS;
33
+ }
@@ -0,0 +1,92 @@
1
+ [
2
+ {
3
+ "domain": "mail.google.com",
4
+ "skill": "Gmail best practices:\n- To open an email, click directly on the email subject/preview text, NOT the checkbox or star\n- Use keyboard shortcuts: 'c' to compose, 'r' to reply, 'a' to reply all, 'f' to forward, 'e' to archive\n- To search, use the search bar at the top with operators like 'from:', 'to:', 'subject:', 'is:unread'\n- Reading pane may be on the right or below depending on user settings - check which layout is active\n- Verification codes are often in emails from 'noreply@' addresses with subjects containing 'verification', 'code', or 'confirm'"
5
+ },
6
+ {
7
+ "domain": "docs.google.com",
8
+ "skill": "Google Docs best practices:\n- This is a canvas-based application - use screenshots to see content, read_page may not capture all text\n- Use keyboard shortcuts: Cmd/Ctrl+B for bold, Cmd/Ctrl+I for italic, Cmd/Ctrl+K for links\n- To navigate, use Cmd/Ctrl+F to find text, then click on the result\n- For editing, click to place cursor then type - triple-click to select a paragraph\n- Access menus via the menu bar at the top (File, Edit, View, Insert, Format, etc.)"
9
+ },
10
+ {
11
+ "domain": "sheets.google.com",
12
+ "skill": "Google Sheets best practices:\n- Click on cells to select them, double-click to edit cell content\n- Use Tab to move right, Enter to move down, arrow keys to navigate\n- Formulas start with '=' - e.g., =SUM(A1:A10), =VLOOKUP(), =IF()\n- Use Cmd/Ctrl+C and Cmd/Ctrl+V for copy/paste\n- Select ranges by clicking and dragging, or Shift+click for range selection"
13
+ },
14
+ {
15
+ "domain": "github.com",
16
+ "skill": "GitHub best practices:\n- Repository navigation: Code tab for files, Issues for bug tracking, Pull requests for code review\n- To view a file, click on the filename in the file tree\n- Use 't' to open file finder, 'l' to jump to a line\n- In PRs: 'Files changed' tab shows diffs, 'Conversation' tab shows comments\n- Use the search bar with qualifiers: 'is:open is:pr', 'is:issue label:bug'"
17
+ },
18
+ {
19
+ "domain": "reddit.com",
20
+ "antiBot": true,
21
+ "skill": "Reddit UI patterns:\n- Posts are listed in a feed - click on post title to view full post and comments\n- Comments are nested/threaded - each comment has its own reply button underneath\n- Upvote (up arrow) and downvote (down arrow) buttons are to the left of each post/comment\n- To comment, scroll to comment box at top of comments section, or click reply under a specific comment\n- Use the search bar at top to find subreddits or posts\n- r/subredditname format for community names"
22
+ },
23
+ {
24
+ "domain": "linkedin.com",
25
+ "antiBot": true,
26
+ "skill": "LinkedIn UI patterns:\n\n## Messaging & Connections\n- To message someone: first check if you're connected (1st degree) - if not, send a connection request first\n- Connection request: go to their profile, click 'Connect' button, optionally add a note\n- Once connected, use the 'Message' button on their profile or go to Messaging tab\n- InMail (messaging non-connections) requires Premium subscription\n\n## Easy Apply Forms\n- Contact Info page is pre-filled from LinkedIn profile - don't try to modify, just click Next\n- Modal forms may need scrolling to see all content and buttons\n- Use screenshots over read_page for modals - accessibility tree often misses modal content\n\n## Navigation\n- Main tabs: Home (feed), My Network, Jobs, Messaging, Notifications\n- Job search: Jobs tab → filter by location, experience level, date posted\n- 'Easy Apply' = apply within LinkedIn; 'Apply' = external site\n- Profile sections are collapsible - click 'Show all' to expand"
27
+ },
28
+ {
29
+ "domain": "indeed.com",
30
+ "skill": "Indeed best practices:\n- Search for jobs using the 'What' and 'Where' fields at the top\n- Filter results by date posted, salary, job type, experience level\n- Click job title to view full description\n- 'Apply now' or 'Apply on company site' buttons are typically on the right panel\n- Sign in to save jobs and track applications"
31
+ },
32
+ {
33
+ "domain": "calendar.google.com",
34
+ "skill": "Google Calendar best practices:\n- Click on a time slot to create a new event\n- Drag events to reschedule them\n- Click on an event to view details, edit, or delete\n- Use the mini calendar on the left to navigate to different dates\n- Keyboard: 'c' to create event, 't' to go to today, arrow keys to navigate"
35
+ },
36
+ {
37
+ "domain": "drive.google.com",
38
+ "skill": "Google Drive best practices:\n- Double-click files to open them, single-click to select\n- Right-click for context menu (download, share, rename, etc.)\n- Use the search bar to find files by name or content\n- Create new items with the '+ New' button on the left\n- Drag and drop to move files between folders"
39
+ },
40
+ {
41
+ "domain": "notion.so",
42
+ "skill": "Notion best practices:\n- Click to place cursor, type '/' to open command menu\n- Drag blocks using the ⋮⋮ handle on the left\n- Use sidebar for navigation between pages\n- Toggle blocks expand/collapse on click\n- Databases can be viewed as table, board, calendar, etc."
43
+ },
44
+ {
45
+ "domain": "figma.com",
46
+ "skill": "Figma best practices:\n- This is a canvas-based design tool - always use screenshots to see content\n- Use 'V' for select tool, 'R' for rectangle, 'T' for text\n- Zoom with Cmd/Ctrl+scroll or Cmd/Ctrl++ and Cmd/Ctrl+-\n- Navigate frames in the left sidebar\n- Right-click for context menus and additional options"
47
+ },
48
+ {
49
+ "domain": "slack.com",
50
+ "skill": "Slack best practices:\n- Channels listed in left sidebar - click to switch\n- Cmd/Ctrl+K to quickly switch channels/DMs\n- @ mentions notify users, # references channels\n- Thread replies keep conversations organized\n- Use the search bar to find messages, files, and people"
51
+ },
52
+ {
53
+ "domain": "twitter.com",
54
+ "antiBot": true,
55
+ "skill": "See x.com — twitter.com redirects to x.com."
56
+ },
57
+ {
58
+ "domain": "x.com",
59
+ "antiBot": true,
60
+ "skill": "X/Twitter — verified patterns (updated 2026-03-30)\n\n## Reading pages (CRITICAL)\n- X loads content asynchronously — page looks empty for 3-5 seconds after navigation.\n- read_page often returns ONLY \"To view keyboard shortcuts\" — tweets haven't loaded yet.\n- DO NOT re-navigate to the same URL. That resets loading and makes it worse.\n- Instead: wait 5 seconds, then use get_page_text — it reads visible text and is more reliable.\n- If get_page_text returns nothing, scroll down once and try again.\n\n## Search\n- URL: x.com/search?q={encoded_query}&src=typed_query&f=live\n- After navigating, wait 5 seconds, then get_page_text (NOT read_page).\n- Scroll down once to load more tweets, then get_page_text again.\n- Tweet URLs in page text follow pattern: /status/{id}\n\n## Text input (CRITICAL — Draft.js)\n- form_input DOES NOT WORK — Draft.js ignores programmatic input.\n- computer type action GARBLES TEXT.\n- ONLY RELIABLE METHOD — use javascript_tool:\n document.querySelector('[data-testid=\"tweetTextarea_0\"]').focus();\n document.execCommand('insertText', false, 'your reply text here');\n- Always verify text appeared by reading after insertion.\n\n## Replying to a tweet\n1. Navigate to tweet URL (x.com/{handle}/status/{id})\n2. Wait 3 seconds, read the page\n3. Click the reply/comment icon (speech bubble) in the action bar\n4. Use javascript_tool to insert text (see above)\n5. Verify text appeared, then click blue \"Reply\" button\n6. Wait 2 seconds to confirm reply posted\n\n## Known traps\n- DO NOT scroll looking for \"Post your reply\" — reply box appears after clicking comment icon\n- x.com/compose/post may open — that's fine, type and click Reply there\n- \"Leave site?\" dialog — ALWAYS click Cancel, finish posting first\n- Reply button is disabled until text is entered — verify first\n- Space replies 15+ seconds apart (rate limiting)\n- NEVER navigate to the same URL you're already on"
61
+ },
62
+ {
63
+ "domain": "amazon.com",
64
+ "skill": "Amazon UI patterns:\n- Prefer the top search bar or a direct search URL for product discovery\n- The first fold may be dominated by sponsored modules or Amazon brand carousels before standard results\n- Result cards can show total price, unit price, ratings, delivery promises, variation links, and inline Add to Cart buttons all at once\n- Watch carefully for subtle labels such as 'Sponsored' or 'Featured from Amazon brands'\n- Location/shipping prompts can change delivery wording and result context, so verify destination-sensitive text before extracting prices"
65
+ },
66
+ {
67
+ "domain": "ebay.com",
68
+ "skill": "eBay UI patterns:\n- Search results usually show title, condition, price, shipping, and location directly on the card\n- Headline price is often incomplete until you also check shipping cost, coupon text, and 'Best Offer' language\n- Sponsored placements and featured carousels can interrupt the organic result list\n- Use condition and shipping origin early to filter out weak matches\n- Be cautious with range prices and urgency labels like 'Last one' or watcher counts; they do not guarantee a clean fixed-price purchase path"
69
+ },
70
+ {
71
+ "domain": "walmart.ca",
72
+ "antiBot": true,
73
+ "skill": "Walmart.ca UI patterns:\n- Search results often expose enough information to compare products before opening a product page\n- Cards commonly include brand, title, price, ratings, fulfillment messaging, and promo details such as Subscribe to Save or pickup thresholds\n- Walmart.com may first present a storefront or region-selection step and later a human-verification blocker ('Press & Hold' challenge); validation was more reliable on Walmart.ca\n- On Walmart.ca product pages, verify one-time price separately from Subscribe to Save pricing and check pickup, express delivery, and standard delivery messages individually\n- Treat promo labels, fulfillment thresholds, and returns messaging as separate signals and verify them individually\n- If you hit a 'Press & Hold' or 'verify you are human' challenge, STOP and tell the user — never attempt to bypass"
74
+ },
75
+ {
76
+ "domain": "target.com",
77
+ "skill": "Target UI patterns:\n- Result cards usually include title, review count, price, promo messaging, and a persistent Add to Cart button\n- Price formatting can mix ranges, sale pricing, regular pricing, and per-unit math, so read the full price block carefully\n- Promo text such as Target gift card offers or 'Highly rated' badges appears frequently and can distract from the base price\n- Fulfillment filters are easy to see near the top, but card-level availability may still need a deeper click\n- Search results can include adjacent-category items, so confirm the exact product type from the title before acting"
78
+ },
79
+ {
80
+ "domain": "zillow.com",
81
+ "antiBot": true,
82
+ "skill": "Zillow UI patterns (rentals):\n- Sign in first for full contact info and saved searches; anonymous users see degraded data\n- Use 'For Rent' in the top nav, then refine by price, beds, and 'Move-in Date'\n- Results have a split map + list view — switch to list view for cleaner data extraction\n- Listing cards include price, beds/baths, address, and 'Apply' or 'Request a tour' buttons\n- 'Request a tour' and 'Contact' buttons submit forms — draft the message and wait for explicit user approval before clicking submit\n- Never enter SSN, payment info, or background check data without confirming the user is on an official Zillow Application flow\n- Zillow frequently serves a CAPTCHA / 'Press & Hold' challenge for automated traffic — if you hit one, STOP and tell the user, do not bypass"
83
+ },
84
+ {
85
+ "domain": "apartments.com",
86
+ "skill": "Apartments.com UI patterns:\n- Top filter bar: location, price, beds, move-in date, amenities — apply them before reading results\n- Listing cards include price range, beds/baths, address, and a 'Send Message' or 'Contact' button\n- Many listings have a built-in multi-step application flow — each step is clearly labeled; stop before 'Submit Application' and confirm with the user\n- 'Send Message' opens an inline form — draft the message, show it to the user, wait for approval before submitting\n- Pricing is often shown as a range ($X-$Y) because individual units vary; click into the listing for per-unit prices\n- Response times for large property managers are usually fast (hours); individual landlords slower\n- Do not enter SSN, credit-card, or bank info without explicit user confirmation that they want to proceed with a real application"
87
+ },
88
+ {
89
+ "domain": "craigslist.org",
90
+ "skill": "Craigslist UI patterns (apartments / housing):\n- URL pattern: https://[city].craigslist.org/search/apa?minAsk={min}&maxAsk={max}&bedrooms={n}\n- Sort by 'newest' to avoid stale listings — older listings are frequently scams or already rented\n- Listings may or may not have photos; listings without photos are higher risk, flag them\n- Contact is always via an anonymized email relay — no built-in application flow\n- Scam flags to warn the user about: price >25% below market, 'owner overseas', 'send deposit to hold', asks to text/WhatsApp before viewing, generic stock photos\n- Never follow links off craigslist for a 'verified listings site' — those are almost always phishing\n- Show the user any inquiry message before sending, and wait for explicit approval"
91
+ }
92
+ ]
@@ -13,9 +13,13 @@ export function buildSystemPrompt(taskUrl) {
13
13
  const blocks = [
14
14
  {
15
15
  type: "text",
16
- text: `You are a web automation assistant with browser tools. Your priority is to complete the user's request efficiently and autonomously.
16
+ text: `You are Hanzi Browse, a browsing sub-agent driving the user's own Chrome browser — with all their logins, cookies, and sessions already in place. A host agent (Claude Code, Cursor, Codex, etc.) has delegated a task to you. Your job is to complete that task autonomously using the browser tools below, and return a concise answer.
17
17
 
18
- Browser tasks often require long-running, agentic capabilities. When you encounter a user request that feels time-consuming or extensive in scope, you should be persistent and use all available context needed to accomplish the task. The user expects you to work autonomously until the task is complete. Do not ask for permission - just do it.
18
+ You are NOT a step-by-step executor reading a script. You are an agent. You decide what to click, what to type, what to wait for, and when you're done. The host agent gave you a goal in natural language; figure out the steps yourself and complete the goal.
19
+
20
+ When the host agent sends you a follow-up via browser_message, it's course-correcting or refining the task — treat it as the latest instruction from the user and continue from the current browser state.
21
+
22
+ You are persistent. Long or multi-step tasks are expected. The host agent expects you to work until the task is complete. Do not ask for permission — just do it.
19
23
 
20
24
  <behavior_instructions>
21
25
  The current date is ${dateStr}, ${timeStr}.
@@ -10,6 +10,8 @@ interface AgentConfig {
10
10
  method: 'json-merge' | 'cli-command';
11
11
  detect: () => boolean;
12
12
  configPath?: () => string;
13
+ configSection?: 'mcpServers' | 'servers' | 'context_servers';
14
+ legacyConfigSections?: ('mcpServers' | 'servers' | 'context_servers')[];
13
15
  cliCommand?: string;
14
16
  skillsDir?: () => string;
15
17
  }
@@ -22,6 +24,7 @@ interface AgentRegistryDeps {
22
24
  home?: string;
23
25
  plat?: NodeJS.Platform;
24
26
  appData?: string;
27
+ xdgConfigHome?: string;
25
28
  pathExists?: (path: string) => boolean;
26
29
  runCommand?: (command: string, options?: any) => Buffer | string;
27
30
  }
@@ -41,6 +44,7 @@ interface BrowserDetectionDeps {
41
44
  }
42
45
  export declare function getAgentRegistry(deps?: AgentRegistryDeps): AgentConfig[];
43
46
  export declare function mergeJsonConfig(configPath: string, deps?: JsonConfigDeps): SetupResult;
47
+ export declare function mergeJsonConfigAtKey(configPath: string, configSection: 'mcpServers' | 'servers' | 'context_servers', deps?: JsonConfigDeps, legacyConfigSections?: ('mcpServers' | 'servers' | 'context_servers')[]): SetupResult;
44
48
  interface BrowserInfo {
45
49
  name: string;
46
50
  slug: string;
@@ -57,5 +61,7 @@ export declare function buildSystemOpenCommand(url: string, plat: NodeJS.Platfor
57
61
  export declare function runSetup(options?: {
58
62
  only?: string;
59
63
  yes?: boolean;
64
+ all?: boolean;
65
+ skills?: string[];
60
66
  }): Promise<void>;
61
67
  export {};