hanzi-browse 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -79
- package/dist/agent/domain-knowledge.d.ts +20 -0
- package/dist/agent/domain-knowledge.js +33 -0
- package/dist/agent/domain-skills.json +66 -0
- package/dist/agent/loop.d.ts +12 -0
- package/dist/agent/loop.js +41 -1
- package/dist/agent/system-prompt.d.ts +1 -1
- package/dist/agent/system-prompt.js +12 -2
- package/dist/cli/json-output.d.ts +21 -0
- package/dist/cli/json-output.js +30 -0
- package/dist/cli/setup.d.ts +51 -0
- package/dist/cli/setup.js +113 -41
- package/dist/cli.js +29 -8
- package/dist/index.js +1 -567
- package/dist/managed/api.d.ts +20 -1
- package/dist/managed/api.js +181 -521
- package/dist/managed/auth.js +0 -5
- package/dist/managed/deploy.js +82 -0
- package/dist/managed/routes/api.d.ts +44 -0
- package/dist/managed/routes/api.js +220 -0
- package/dist/managed/routes/pages.d.ts +13 -0
- package/dist/managed/routes/pages.js +149 -0
- package/dist/managed/store-pg.d.ts +5 -1
- package/dist/managed/store-pg.js +12 -4
- package/dist/managed/store.d.ts +6 -1
- package/dist/managed/store.js +4 -2
- package/dist/managed/templates/pair-self.html +67 -0
- package/dist/managed/templates/pair.html +97 -0
- package/dist/mcp/tools.d.ts +20 -0
- package/dist/mcp/tools.js +263 -0
- package/dist/relay/api-proxy.d.ts +2 -0
- package/dist/relay/api-proxy.js +165 -0
- package/dist/relay/server.js +2 -112
- package/package.json +3 -3
- package/skills/competitor-monitor/SKILL.md +290 -0
- package/skills/data-extractor/SKILL.md +223 -0
- package/skills/job-applier/SKILL.md +260 -0
- package/skills/seo-checker/SKILL.md +146 -0
package/README.md
CHANGED
|
@@ -1,45 +1,51 @@
|
|
|
1
|
-
# Hanzi Browse
|
|
1
|
+
# Hanzi Browse
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
the Chrome extension over the local WebSocket relay.
|
|
3
|
+
Give your AI agent a real browser — with your existing logins, cookies, and sessions.
|
|
5
4
|
|
|
6
|
-
|
|
5
|
+
**Two ways to use it:**
|
|
6
|
+
- **Use locally** — MCP server for Claude Code, Cursor, Codex, and other AI coding agents
|
|
7
|
+
- **Build with it** — REST API + TypeScript SDK for embedding browser automation in your product
|
|
8
|
+
|
|
9
|
+
## Quick Start (MCP)
|
|
7
10
|
|
|
8
11
|
```bash
|
|
9
|
-
|
|
10
|
-
npm install
|
|
11
|
-
npm run build
|
|
12
|
+
npx hanzi-browse setup
|
|
12
13
|
```
|
|
13
14
|
|
|
14
|
-
|
|
15
|
+
This installs the Chrome extension and configures your AI agent. One command, done.
|
|
15
16
|
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
}
|
|
23
|
-
}
|
|
24
|
-
}
|
|
17
|
+
**Prerequisites:** Chrome must be open with the [Hanzi extension](https://chromewebstore.google.com/detail/hanzi-browse/iklpkemlmbhemkiojndpbhoakgikpmcd) installed.
|
|
18
|
+
|
|
19
|
+
## Quick Start (API)
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
npm install @hanzi/browser-agent
|
|
25
23
|
```
|
|
26
24
|
|
|
27
|
-
|
|
25
|
+
```typescript
|
|
26
|
+
import { HanziClient } from '@hanzi/browser-agent';
|
|
28
27
|
|
|
29
|
-
|
|
28
|
+
const client = new HanziClient({ apiKey: 'hic_live_...' });
|
|
30
29
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
30
|
+
// 1. Pair a browser — give the URL to your user
|
|
31
|
+
const { pairingToken } = await client.createPairingToken();
|
|
32
|
+
// User visits: https://api.hanzilla.co/pair/{pairingToken}
|
|
33
|
+
|
|
34
|
+
// 2. Find their connected session
|
|
35
|
+
const sessions = await client.listSessions();
|
|
36
|
+
const browser = sessions.find(s => s.status === 'connected');
|
|
37
|
+
|
|
38
|
+
// 3. Run a task (polls until complete)
|
|
39
|
+
const result = await client.runTask({
|
|
40
|
+
browserSessionId: browser.id,
|
|
41
|
+
task: 'Go to example.com and read the page title',
|
|
42
|
+
});
|
|
43
|
+
console.log(result.answer);
|
|
37
44
|
```
|
|
38
45
|
|
|
39
|
-
|
|
40
|
-
tool calls, local session bookkeeping, and blocking waits for completion.
|
|
46
|
+
Full API docs: [browse.hanzilla.co/docs.html](https://browse.hanzilla.co/docs.html)
|
|
41
47
|
|
|
42
|
-
## Tools
|
|
48
|
+
## MCP Tools
|
|
43
49
|
|
|
44
50
|
### `browser_start`
|
|
45
51
|
|
|
@@ -55,7 +61,6 @@ browser_start(
|
|
|
55
61
|
→ {
|
|
56
62
|
"session_id": "abc123",
|
|
57
63
|
"status": "complete",
|
|
58
|
-
"task": "Search for flights to Tokyo...",
|
|
59
64
|
"answer": "Found 3 flights: JAL $850, ANA $920, United $780",
|
|
60
65
|
"total_steps": 8,
|
|
61
66
|
"recent_steps": ["Opened Google Flights", "Set destination to Tokyo", ...]
|
|
@@ -64,7 +69,7 @@ browser_start(
|
|
|
64
69
|
|
|
65
70
|
### `browser_message`
|
|
66
71
|
|
|
67
|
-
Send follow-up instructions to an existing session.
|
|
72
|
+
Send follow-up instructions to an existing session.
|
|
68
73
|
|
|
69
74
|
```
|
|
70
75
|
browser_message(session_id: "abc123", message: "Book the cheapest one")
|
|
@@ -75,7 +80,7 @@ browser_message(session_id: "abc123", message: "Book the cheapest one")
|
|
|
75
80
|
Check known sessions and their latest status.
|
|
76
81
|
|
|
77
82
|
```
|
|
78
|
-
browser_status()
|
|
83
|
+
browser_status() // all active sessions
|
|
79
84
|
browser_status(session_id: "abc123") // specific session
|
|
80
85
|
```
|
|
81
86
|
|
|
@@ -85,7 +90,7 @@ Stop a task.
|
|
|
85
90
|
|
|
86
91
|
```
|
|
87
92
|
browser_stop(session_id: "abc123")
|
|
88
|
-
browser_stop(session_id: "abc123", remove: true) // also
|
|
93
|
+
browser_stop(session_id: "abc123", remove: true) // also close window
|
|
89
94
|
```
|
|
90
95
|
|
|
91
96
|
### `browser_screenshot`
|
|
@@ -98,84 +103,63 @@ browser_screenshot(session_id: "abc123")
|
|
|
98
103
|
|
|
99
104
|
## Examples
|
|
100
105
|
|
|
101
|
-
**Research:**
|
|
102
|
-
```
|
|
103
|
-
browser_start("Find the top 3 competitors for Acme Corp and summarize their pricing")
|
|
104
|
-
```
|
|
105
|
-
|
|
106
106
|
**Logged-in workflows:**
|
|
107
107
|
```
|
|
108
|
-
browser_start("Go to Jira, find my open tickets, and summarize what needs attention
|
|
108
|
+
browser_start("Go to Jira, find my open tickets, and summarize what needs attention")
|
|
109
109
|
```
|
|
110
110
|
|
|
111
111
|
**Multi-turn:**
|
|
112
112
|
```
|
|
113
113
|
s = browser_start("Go to LinkedIn and find AI Engineer jobs in Montreal")
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
browser_message("x1", "Click into that job and tell me the requirements")
|
|
117
|
-
→ { answer: "Requirements: 3+ years Python, ML experience..." }
|
|
118
|
-
|
|
119
|
-
browser_message("x1", "Apply to this job using my profile")
|
|
120
|
-
→ { answer: "Application submitted successfully" }
|
|
114
|
+
browser_message(s.session_id, "Click into the Cohere job and tell me the requirements")
|
|
115
|
+
browser_message(s.session_id, "Apply to this job using my profile")
|
|
121
116
|
```
|
|
122
117
|
|
|
123
118
|
**Parallel execution:**
|
|
124
119
|
```
|
|
125
120
|
browser_start("Check flight prices to Tokyo")
|
|
126
121
|
browser_start("Check hotel prices in Shibuya")
|
|
127
|
-
|
|
128
|
-
// All three run simultaneously
|
|
122
|
+
// Both run simultaneously in separate windows
|
|
129
123
|
```
|
|
130
124
|
|
|
131
125
|
## Configuration
|
|
132
126
|
|
|
133
127
|
| Environment Variable | Default | Description |
|
|
134
128
|
|---|---|---|
|
|
135
|
-
| `
|
|
129
|
+
| `HANZI_BROWSE_MAX_SESSIONS` | `5` | Max concurrent browser tasks |
|
|
130
|
+
| `HANZI_BROWSE_TIMEOUT_MS` | `300000` | Task timeout (ms) |
|
|
136
131
|
| `WS_RELAY_PORT` | `7862` | WebSocket relay port |
|
|
137
132
|
|
|
138
|
-
##
|
|
133
|
+
## Skills
|
|
139
134
|
|
|
140
|
-
|
|
141
|
-
AI Tool (Claude Code, Cursor, etc.)
|
|
142
|
-
↓ MCP Protocol (stdio)
|
|
143
|
-
MCP Server
|
|
144
|
-
↓ WebSocket
|
|
145
|
-
Relay Server
|
|
146
|
-
↓ WebSocket
|
|
147
|
-
Chrome Extension
|
|
148
|
-
↓ Extension agent loop
|
|
149
|
-
Target Website
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
The relay server starts automatically when the MCP server connects. It routes
|
|
153
|
-
messages between the MCP server and the Chrome extension and briefly queues
|
|
154
|
-
messages while the extension service worker is asleep.
|
|
155
|
-
|
|
156
|
-
> **Principle**: Hanzi is for real browser work in your signed-in Chrome.
|
|
157
|
-
> Agents should prefer code, logs, APIs, and existing tools first. Use Hanzi when the job needs a real browser session.
|
|
158
|
-
|
|
159
|
-
## Prompts
|
|
160
|
-
|
|
161
|
-
The server exposes MCP prompts that clients auto-discover as slash commands:
|
|
135
|
+
The server exposes MCP prompts that clients auto-discover:
|
|
162
136
|
|
|
163
137
|
| Prompt | Description |
|
|
164
138
|
|--------|-------------|
|
|
165
|
-
| `linkedin-prospector` | Goal-driven LinkedIn outreach
|
|
166
|
-
| `e2e-tester` | Test your app in a real browser
|
|
167
|
-
| `social-poster` | Post across LinkedIn, Twitter, Reddit
|
|
168
|
-
|
|
169
|
-
In Claude Code, use the built-in `linkedin-prospector` prompt from the MCP prompt list.
|
|
170
|
-
|
|
171
|
-
## Skills CLI
|
|
139
|
+
| `linkedin-prospector` | Goal-driven LinkedIn outreach |
|
|
140
|
+
| `e2e-tester` | Test your app in a real browser with screenshots |
|
|
141
|
+
| `social-poster` | Post across LinkedIn, Twitter, Reddit from your browser |
|
|
142
|
+
| `x-marketer` | Find X/Twitter conversations and draft voice-matched replies |
|
|
172
143
|
|
|
173
144
|
```bash
|
|
174
145
|
hanzi-browser skills # list available skills
|
|
175
146
|
hanzi-browser skills install linkedin-prospector # install SKILL.md to your project
|
|
176
147
|
```
|
|
177
148
|
|
|
178
|
-
|
|
149
|
+
## Architecture
|
|
150
|
+
|
|
151
|
+
```
|
|
152
|
+
AI Agent (Claude Code, Cursor, etc.)
|
|
153
|
+
↓ MCP Protocol (stdio)
|
|
154
|
+
MCP Server (this package)
|
|
155
|
+
↓ WebSocket
|
|
156
|
+
Chrome Extension
|
|
157
|
+
↓ Chrome DevTools Protocol
|
|
158
|
+
User's Real Browser
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
> **Principle**: Hanzi is for real browser work in your signed-in Chrome.
|
|
162
|
+
> Agents should prefer code, logs, APIs, and existing tools first. Use Hanzi when the job needs a real browser session.
|
|
179
163
|
|
|
180
164
|
## License
|
|
181
165
|
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain-specific knowledge for the agent loop.
|
|
3
|
+
* Single source of truth — shared between server (managed API, MCP)
|
|
4
|
+
* and extension (via import at build time).
|
|
5
|
+
*/
|
|
6
|
+
/**
 * One record of site-specific guidance, as stored in domain-skills.json.
 */
interface DomainEntry {
|
|
7
|
+
/** Domain the entry applies to, e.g. "github.com" or "mail.google.com". */
domain: string;
|
|
8
|
+
/**
 * Optional flag; domain-skills.json sets it to true for reddit.com,
 * linkedin.com, twitter.com and x.com.
 * NOTE(review): how consumers act on this flag is not visible here — confirm.
 */
antiBot?: boolean;
|
|
9
|
+
/** Free-form guidance text for the site. */
skill: string;
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Look up domain knowledge for a URL.
|
|
13
|
+
* Returns the first matching entry, or null.
*
* @param url Absolute URL, or a bare domain when no scheme is present.
* @returns The first entry whose domain matches, or null when none does.
|
|
14
|
+
*/
|
|
15
|
+
export declare function getDomainSkill(url: string): DomainEntry | null;
|
|
16
|
+
/**
|
|
17
|
+
* Get all domain skills. Used by extension to import the full list.
*
* @returns Every known entry, in list order.
|
|
18
|
+
*/
|
|
19
|
+
export declare function getAllDomainSkills(): DomainEntry[];
|
|
20
|
+
export {};
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Domain-specific knowledge for the agent loop.
|
|
3
|
+
* Single source of truth — shared between server (managed API, MCP)
|
|
4
|
+
* and extension (via import at build time).
|
|
5
|
+
*/
|
|
6
|
+
import { readFileSync } from "fs";
|
|
7
|
+
import { fileURLToPath } from "url";
|
|
8
|
+
import { dirname, join } from "path";
|
|
9
|
+
// Load from shared JSON file.
// __filename/__dirname are derived from import.meta.url so the JSON file can be
// located next to this module.
|
|
10
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
11
|
+
const __dirname = dirname(__filename);
|
|
12
|
+
// Read synchronously and parsed once, at module load: a missing or malformed
// domain-skills.json makes importing this module throw.
const DOMAIN_SKILLS = JSON.parse(readFileSync(join(__dirname, "domain-skills.json"), "utf-8"));
|
|
13
|
+
/**
 * Look up domain knowledge for a URL.
 *
 * Matching is done on hostnames only: an entry matches when a hostname is
 * exactly the entry's domain or a subdomain of it. Entries are tested in
 * DOMAIN_SKILLS order and the first match wins.
 *
 * @param url Absolute URL, or text / a bare domain without a scheme
 *   (e.g. "github.com/org/repo").
 * @returns The first matching entry, or null when nothing matches or when
 *   `url` is not a non-empty string.
 */
export function getDomainSkill(url) {
    if (typeof url !== "string" || url === "") {
        return null;
    }
    let hostnames;
    try {
        hostnames = [new URL(url).hostname.toLowerCase()];
    }
    catch {
        // Not an absolute URL. Pull out every hostname-shaped token and match
        // those, rather than substring-searching the raw text: a plain
        // includes() made "netflix.com" and "dropbox.com" hit the "x.com" entry.
        hostnames = url.toLowerCase().match(/[a-z0-9-]+(?:\.[a-z0-9-]+)+/g) || [];
    }
    // Same rule for both paths: exact host, or a subdomain of the entry's domain.
    const matchesDomain = (hostname, domain) => hostname === domain || hostname.endsWith("." + domain);
    return DOMAIN_SKILLS.find((d) => hostnames.some((h) => matchesDomain(h, d.domain))) || null;
}
|
|
28
|
+
/**
|
|
29
|
+
* Get all domain skills. Used by extension to import the full list.
|
|
30
|
+
*/
|
|
31
|
+
export function getAllDomainSkills() {
|
|
32
|
+
// Returns the module-level array itself, not a copy — a caller that mutates
// the result also changes what getDomainSkill sees.
return DOMAIN_SKILLS;
|
|
33
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"domain": "mail.google.com",
|
|
4
|
+
"skill": "Gmail best practices:\n- To open an email, click directly on the email subject/preview text, NOT the checkbox or star\n- Use keyboard shortcuts: 'c' to compose, 'r' to reply, 'a' to reply all, 'f' to forward, 'e' to archive\n- To search, use the search bar at the top with operators like 'from:', 'to:', 'subject:', 'is:unread'\n- Reading pane may be on the right or below depending on user settings - check which layout is active\n- Verification codes are often in emails from 'noreply@' addresses with subjects containing 'verification', 'code', or 'confirm'"
|
|
5
|
+
},
|
|
6
|
+
{
|
|
7
|
+
"domain": "docs.google.com",
|
|
8
|
+
"skill": "Google Docs best practices:\n- This is a canvas-based application - use screenshots to see content, read_page may not capture all text\n- Use keyboard shortcuts: Cmd/Ctrl+B for bold, Cmd/Ctrl+I for italic, Cmd/Ctrl+K for links\n- To navigate, use Cmd/Ctrl+F to find text, then click on the result\n- For editing, click to place cursor then type - triple-click to select a paragraph\n- Access menus via the menu bar at the top (File, Edit, View, Insert, Format, etc.)"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"domain": "sheets.google.com",
|
|
12
|
+
"skill": "Google Sheets best practices:\n- Click on cells to select them, double-click to edit cell content\n- Use Tab to move right, Enter to move down, arrow keys to navigate\n- Formulas start with '=' - e.g., =SUM(A1:A10), =VLOOKUP(), =IF()\n- Use Cmd/Ctrl+C and Cmd/Ctrl+V for copy/paste\n- Select ranges by clicking and dragging, or Shift+click for range selection"
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"domain": "github.com",
|
|
16
|
+
"skill": "GitHub best practices:\n- Repository navigation: Code tab for files, Issues for bug tracking, Pull requests for code review\n- To view a file, click on the filename in the file tree\n- Use 't' to open file finder, 'l' to jump to a line\n- In PRs: 'Files changed' tab shows diffs, 'Conversation' tab shows comments\n- Use the search bar with qualifiers: 'is:open is:pr', 'is:issue label:bug'"
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"domain": "reddit.com",
|
|
20
|
+
"antiBot": true,
|
|
21
|
+
"skill": "Reddit UI patterns:\n- Posts are listed in a feed - click on post title to view full post and comments\n- Comments are nested/threaded - each comment has its own reply button underneath\n- Upvote (up arrow) and downvote (down arrow) buttons are to the left of each post/comment\n- To comment, scroll to comment box at top of comments section, or click reply under a specific comment\n- Use the search bar at top to find subreddits or posts\n- r/subredditname format for community names"
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"domain": "linkedin.com",
|
|
25
|
+
"antiBot": true,
|
|
26
|
+
"skill": "LinkedIn UI patterns:\n\n## Messaging & Connections\n- To message someone: first check if you're connected (1st degree) - if not, send a connection request first\n- Connection request: go to their profile, click 'Connect' button, optionally add a note\n- Once connected, use the 'Message' button on their profile or go to Messaging tab\n- InMail (messaging non-connections) requires Premium subscription\n\n## Easy Apply Forms\n- Contact Info page is pre-filled from LinkedIn profile - don't try to modify, just click Next\n- Modal forms may need scrolling to see all content and buttons\n- Use screenshots over read_page for modals - accessibility tree often misses modal content\n\n## Navigation\n- Main tabs: Home (feed), My Network, Jobs, Messaging, Notifications\n- Job search: Jobs tab → filter by location, experience level, date posted\n- 'Easy Apply' = apply within LinkedIn; 'Apply' = external site\n- Profile sections are collapsible - click 'Show all' to expand"
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
"domain": "indeed.com",
|
|
30
|
+
"skill": "Indeed best practices:\n- Search for jobs using the 'What' and 'Where' fields at the top\n- Filter results by date posted, salary, job type, experience level\n- Click job title to view full description\n- 'Apply now' or 'Apply on company site' buttons are typically on the right panel\n- Sign in to save jobs and track applications"
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"domain": "calendar.google.com",
|
|
34
|
+
"skill": "Google Calendar best practices:\n- Click on a time slot to create a new event\n- Drag events to reschedule them\n- Click on an event to view details, edit, or delete\n- Use the mini calendar on the left to navigate to different dates\n- Keyboard: 'c' to create event, 't' to go to today, arrow keys to navigate"
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"domain": "drive.google.com",
|
|
38
|
+
"skill": "Google Drive best practices:\n- Double-click files to open them, single-click to select\n- Right-click for context menu (download, share, rename, etc.)\n- Use the search bar to find files by name or content\n- Create new items with the '+ New' button on the left\n- Drag and drop to move files between folders"
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"domain": "notion.so",
|
|
42
|
+
"skill": "Notion best practices:\n- Click to place cursor, type '/' to open command menu\n- Drag blocks using the ⋮⋮ handle on the left\n- Use sidebar for navigation between pages\n- Toggle blocks expand/collapse on click\n- Databases can be viewed as table, board, calendar, etc."
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"domain": "figma.com",
|
|
46
|
+
"skill": "Figma best practices:\n- This is a canvas-based design tool - always use screenshots to see content\n- Use 'V' for select tool, 'R' for rectangle, 'T' for text\n- Zoom with Cmd/Ctrl+scroll or Cmd/Ctrl++ and Cmd/Ctrl+-\n- Navigate frames in the left sidebar\n- Right-click for context menus and additional options"
|
|
47
|
+
},
|
|
48
|
+
{
|
|
49
|
+
"domain": "slack.com",
|
|
50
|
+
"skill": "Slack best practices:\n- Channels listed in left sidebar - click to switch\n- Cmd/Ctrl+K to quickly switch channels/DMs\n- @ mentions notify users, # references channels\n- Thread replies keep conversations organized\n- Use the search bar to find messages, files, and people"
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"domain": "twitter.com",
|
|
54
|
+
"antiBot": true,
|
|
55
|
+
"skill": "See x.com — twitter.com redirects to x.com."
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"domain": "x.com",
|
|
59
|
+
"antiBot": true,
|
|
60
|
+
"skill": "X/Twitter — verified patterns (updated 2026-03-30)\n\n## Reading pages (CRITICAL)\n- X loads content asynchronously — page looks empty for 3-5 seconds after navigation.\n- read_page often returns ONLY \"To view keyboard shortcuts\" — tweets haven't loaded yet.\n- DO NOT re-navigate to the same URL. That resets loading and makes it worse.\n- Instead: wait 5 seconds, then use get_page_text — it reads visible text and is more reliable.\n- If get_page_text returns nothing, scroll down once and try again.\n\n## Search\n- URL: x.com/search?q={encoded_query}&src=typed_query&f=live\n- After navigating, wait 5 seconds, then get_page_text (NOT read_page).\n- Scroll down once to load more tweets, then get_page_text again.\n- Tweet URLs in page text follow pattern: /status/{id}\n\n## Text input (CRITICAL — Draft.js)\n- form_input DOES NOT WORK — Draft.js ignores programmatic input.\n- computer type action GARBLES TEXT.\n- ONLY RELIABLE METHOD — use javascript_tool:\n document.querySelector('[data-testid=\"tweetTextarea_0\"]').focus();\n document.execCommand('insertText', false, 'your reply text here');\n- Always verify text appeared by reading after insertion.\n\n## Replying to a tweet\n1. Navigate to tweet URL (x.com/{handle}/status/{id})\n2. Wait 3 seconds, read the page\n3. Click the reply/comment icon (speech bubble) in the action bar\n4. Use javascript_tool to insert text (see above)\n5. Verify text appeared, then click blue \"Reply\" button\n6. Wait 2 seconds to confirm reply posted\n\n## Known traps\n- DO NOT scroll looking for \"Post your reply\" — reply box appears after clicking comment icon\n- x.com/compose/post may open — that's fine, type and click Reply there\n- \"Leave site?\" dialog — ALWAYS click Cancel, finish posting first\n- Reply button is disabled until text is entered — verify first\n- Space replies 15+ seconds apart (rate limiting)\n- NEVER navigate to the same URL you're already on"
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"domain": "amazon.com",
|
|
64
|
+
"skill": "Amazon best practices:\n- Use the search bar at the top for product search\n- Filter results using the left sidebar (price, ratings, Prime, etc.)\n- Click 'Add to Cart' or 'Buy Now' to purchase\n- Product details and reviews are on the product page\n- Check seller information and shipping times before purchasing"
|
|
65
|
+
}
|
|
66
|
+
]
|
package/dist/agent/loop.d.ts
CHANGED
|
@@ -48,6 +48,16 @@ export interface StepUpdate {
|
|
|
48
48
|
toolInput?: Record<string, any>;
|
|
49
49
|
text?: string;
|
|
50
50
|
}
|
|
51
|
+
/**
 * Structured record of one agent-loop step: the model's text for that step
 * plus every tool call executed during it.
 */
export interface TurnLog {
|
|
52
|
+
/** Step number of the loop iteration this turn belongs to. */
step: number;
|
|
53
|
+
/** Tool calls executed during this step, in execution order. */
tools: Array<{
|
|
54
|
+
/** Tool name as requested by the model. */
name: string;
|
|
55
|
+
/** Input object passed to the tool. */
input: Record<string, any>;
|
|
56
|
+
/**
 * Text form of the tool result ("Error: ..." on failure). Results longer
 * than 5000 characters are cut and suffixed with "... [truncated]";
 * " [+screenshot]" is appended when the result carried a screenshot.
 */
result: string;
|
|
57
|
+
/** Wall-clock time spent executing the tool, in milliseconds. */
durationMs: number;
|
|
58
|
+
}>;
|
|
59
|
+
/** The model's text blocks joined with newlines and trimmed; null when empty. */
ai_response: string | null;
|
|
60
|
+
}
|
|
51
61
|
export interface AgentLoopResult {
|
|
52
62
|
status: "complete" | "error" | "max_steps";
|
|
53
63
|
answer: string;
|
|
@@ -59,5 +69,7 @@ export interface AgentLoopResult {
|
|
|
59
69
|
};
|
|
60
70
|
/** The model used for the last LLM call (for billing attribution) */
|
|
61
71
|
model?: string;
|
|
72
|
+
/** Structured turn-by-turn log of the agent's actions */
|
|
73
|
+
turns?: TurnLog[];
|
|
62
74
|
}
|
|
63
75
|
export declare function runAgentLoop(params: AgentLoopParams): Promise<AgentLoopResult>;
|
package/dist/agent/loop.js
CHANGED
|
@@ -19,9 +19,12 @@ import { buildSystemPrompt } from "./system-prompt.js";
|
|
|
19
19
|
// --- Agent Loop ---
|
|
20
20
|
export async function runAgentLoop(params) {
|
|
21
21
|
const { task, url, context, executeTool, onStep, onText, maxSteps = 50, signal, } = params;
|
|
22
|
-
|
|
22
|
+
// Detect target URL for domain knowledge — from explicit url param or from task text
|
|
23
|
+
const targetUrl = url || task.match(/https?:\/\/[^\s"')]+/)?.[0];
|
|
24
|
+
const system = buildSystemPrompt(targetUrl);
|
|
23
25
|
const tools = AGENT_TOOLS;
|
|
24
26
|
const messages = [];
|
|
27
|
+
const turns = [];
|
|
25
28
|
let totalUsage = { inputTokens: 0, outputTokens: 0, apiCalls: 0 };
|
|
26
29
|
let lastModel;
|
|
27
30
|
// Build initial user message
|
|
@@ -41,6 +44,7 @@ export async function runAgentLoop(params) {
|
|
|
41
44
|
steps: step - 1,
|
|
42
45
|
usage: totalUsage,
|
|
43
46
|
model: lastModel,
|
|
47
|
+
turns,
|
|
44
48
|
};
|
|
45
49
|
}
|
|
46
50
|
onStep?.({ step, status: "thinking" });
|
|
@@ -63,6 +67,7 @@ export async function runAgentLoop(params) {
|
|
|
63
67
|
steps: step,
|
|
64
68
|
usage: totalUsage,
|
|
65
69
|
model: lastModel,
|
|
70
|
+
turns,
|
|
66
71
|
};
|
|
67
72
|
}
|
|
68
73
|
totalUsage.apiCalls++;
|
|
@@ -79,9 +84,17 @@ export async function runAgentLoop(params) {
|
|
|
79
84
|
// Extract text and tool calls
|
|
80
85
|
const textBlocks = response.content.filter((b) => b.type === "text");
|
|
81
86
|
const toolUseBlocks = response.content.filter((b) => b.type === "tool_use");
|
|
87
|
+
// Start building the turn log for this step
|
|
88
|
+
const currentTurn = {
|
|
89
|
+
step,
|
|
90
|
+
tools: [],
|
|
91
|
+
ai_response: textBlocks.map((b) => b.text).join("\n").trim() || null,
|
|
92
|
+
};
|
|
82
93
|
// If no tool calls, we're done
|
|
83
94
|
if (response.stop_reason === "end_turn" || toolUseBlocks.length === 0) {
|
|
84
95
|
const answer = textBlocks.map((b) => b.text).join("\n").trim();
|
|
96
|
+
turns.push(currentTurn);
|
|
97
|
+
console.error(`[AgentLoop] Complete at step ${step} (${totalUsage.apiCalls} API calls, ${totalUsage.inputTokens} input tokens)`);
|
|
85
98
|
onStep?.({ step, status: "complete", text: answer });
|
|
86
99
|
return {
|
|
87
100
|
status: "complete",
|
|
@@ -89,6 +102,7 @@ export async function runAgentLoop(params) {
|
|
|
89
102
|
steps: step,
|
|
90
103
|
usage: totalUsage,
|
|
91
104
|
model: lastModel,
|
|
105
|
+
turns,
|
|
92
106
|
};
|
|
93
107
|
}
|
|
94
108
|
// Execute each tool call
|
|
@@ -105,6 +119,12 @@ export async function runAgentLoop(params) {
|
|
|
105
119
|
});
|
|
106
120
|
continue;
|
|
107
121
|
}
|
|
122
|
+
// Log tool call
|
|
123
|
+
const inputSummary = toolUse.name === "navigate" ? toolUse.input.url
|
|
124
|
+
: toolUse.name === "computer" ? `${toolUse.input.action}${toolUse.input.ref ? ` ref=${toolUse.input.ref}` : ""}${toolUse.input.coordinate ? ` @${toolUse.input.coordinate}` : ""}`
|
|
125
|
+
: toolUse.name === "javascript_tool" ? toolUse.input.text?.slice(0, 80)
|
|
126
|
+
: JSON.stringify(toolUse.input).slice(0, 80);
|
|
127
|
+
console.error(`[AgentLoop] Step ${step}: ${toolUse.name}(${inputSummary})`);
|
|
108
128
|
onStep?.({
|
|
109
129
|
step,
|
|
110
130
|
status: "tool_use",
|
|
@@ -112,6 +132,7 @@ export async function runAgentLoop(params) {
|
|
|
112
132
|
toolInput: toolUse.input,
|
|
113
133
|
});
|
|
114
134
|
let result;
|
|
135
|
+
const toolStartMs = Date.now();
|
|
115
136
|
try {
|
|
116
137
|
result = await executeTool(toolUse.name, toolUse.input);
|
|
117
138
|
}
|
|
@@ -133,15 +154,32 @@ export async function runAgentLoop(params) {
|
|
|
133
154
|
result = { success: false, error: err.message };
|
|
134
155
|
}
|
|
135
156
|
}
|
|
157
|
+
// Log result summary
|
|
158
|
+
const toolDurationMs = Date.now() - toolStartMs;
|
|
159
|
+
const resultText = result.error ? `Error: ${result.error}`
|
|
160
|
+
: typeof result.output === "string" ? result.output
|
|
161
|
+
: JSON.stringify(result.output);
|
|
162
|
+
const resultSummary = resultText.length > 120 ? resultText.slice(0, 120) + "..." : resultText;
|
|
163
|
+
console.error(`[AgentLoop] Step ${step}: ${toolUse.name} → ${resultSummary}`);
|
|
164
|
+
// Add to structured turn log (truncate large results to keep log manageable)
|
|
165
|
+
currentTurn.tools.push({
|
|
166
|
+
name: toolUse.name,
|
|
167
|
+
input: toolUse.input,
|
|
168
|
+
result: (resultText.length > 5000 ? resultText.slice(0, 5000) + "... [truncated]" : resultText)
|
|
169
|
+
+ (result.screenshot ? " [+screenshot]" : ""),
|
|
170
|
+
durationMs: toolDurationMs,
|
|
171
|
+
});
|
|
136
172
|
onStep?.({ step, status: "tool_result", toolName: toolUse.name });
|
|
137
173
|
// Check abort after each tool — don't feed results back to LLM if cancelled
|
|
138
174
|
if (signal?.aborted) {
|
|
175
|
+
turns.push(currentTurn);
|
|
139
176
|
return {
|
|
140
177
|
status: "error",
|
|
141
178
|
answer: "Task was cancelled.",
|
|
142
179
|
steps: step,
|
|
143
180
|
usage: totalUsage,
|
|
144
181
|
model: lastModel,
|
|
182
|
+
turns,
|
|
145
183
|
};
|
|
146
184
|
}
|
|
147
185
|
// Build tool result content block
|
|
@@ -172,6 +210,7 @@ export async function runAgentLoop(params) {
|
|
|
172
210
|
}
|
|
173
211
|
// Add tool results as user message
|
|
174
212
|
messages.push({ role: "user", content: toolResults });
|
|
213
|
+
turns.push(currentTurn);
|
|
175
214
|
}
|
|
176
215
|
// Exceeded max steps
|
|
177
216
|
const lastText = messages
|
|
@@ -186,5 +225,6 @@ export async function runAgentLoop(params) {
|
|
|
186
225
|
steps: maxSteps,
|
|
187
226
|
usage: totalUsage,
|
|
188
227
|
model: lastModel,
|
|
228
|
+
turns,
|
|
189
229
|
};
|
|
190
230
|
}
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* System prompt for server-side managed agent loop.
|
|
3
3
|
*/
|
|
4
|
-
|
|
4
|
+
import { getDomainSkill } from "./domain-knowledge.js";
|
|
5
|
+
export function buildSystemPrompt(taskUrl) {
|
|
5
6
|
const now = new Date();
|
|
6
7
|
const dateStr = now.toLocaleDateString("en-US", {
|
|
7
8
|
month: "numeric",
|
|
@@ -9,7 +10,7 @@ export function buildSystemPrompt() {
|
|
|
9
10
|
year: "numeric",
|
|
10
11
|
});
|
|
11
12
|
const timeStr = now.toLocaleTimeString("en-US");
|
|
12
|
-
|
|
13
|
+
const blocks = [
|
|
13
14
|
{
|
|
14
15
|
type: "text",
|
|
15
16
|
text: `You are a web automation assistant with browser tools. Your priority is to complete the user's request efficiently and autonomously.
|
|
@@ -38,4 +39,13 @@ When a page shows only a loading spinner, use the computer tool with action "wai
|
|
|
38
39
|
</tool_usage_requirements>`,
|
|
39
40
|
},
|
|
40
41
|
];
|
|
42
|
+
// Inject domain-specific knowledge if the task targets a known site
|
|
43
|
+
const domainSkill = taskUrl ? getDomainSkill(taskUrl) : null;
|
|
44
|
+
if (domainSkill) {
|
|
45
|
+
blocks.push({
|
|
46
|
+
type: "text",
|
|
47
|
+
text: `<domain_knowledge domain="${domainSkill.domain}">\n${domainSkill.skill}\n</domain_knowledge>`,
|
|
48
|
+
});
|
|
49
|
+
}
|
|
50
|
+
return blocks;
|
|
41
51
|
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { SessionFileStatus } from './session-files.js';
|
|
2
|
+
/**
 * Builds the JSON payload for a finished task.
 *
 * @param sessionId Session the task ran in; emitted as `session_id`.
 * @param result Task result, passed through unchanged.
 * @returns Object whose `status` is the literal "completed".
 */
export declare function buildTaskCompletePayload(sessionId: string, result: unknown): {
|
|
3
|
+
session_id: string;
|
|
4
|
+
status: string;
|
|
5
|
+
result: unknown;
|
|
6
|
+
};
|
|
7
|
+
/**
 * Builds the JSON payload for a failed task.
 *
 * @param sessionId Session the task ran in; emitted as `session_id`.
 * @param error Error message, passed through unchanged.
 * @returns Object whose `status` is the literal "error".
 */
export declare function buildTaskErrorPayload(sessionId: string, error: string): {
|
|
8
|
+
session_id: string;
|
|
9
|
+
status: string;
|
|
10
|
+
error: string;
|
|
11
|
+
};
|
|
12
|
+
/**
 * Builds the JSON payload for a status query. The implementation returns its
 * argument unchanged, whether it is a single status or an array of them.
 */
export declare function buildStatusPayload(status: SessionFileStatus | SessionFileStatus[]): SessionFileStatus | SessionFileStatus[];
|
|
13
|
+
/**
 * Builds the JSON payload for a stop request.
 *
 * @param sessionId Session that was stopped; emitted as `session_id`.
 * @param remove Echoed back as `removed`; treated as false when omitted.
 * @returns Object whose `status` is the literal "stopped".
 */
export declare function buildStopPayload(sessionId: string, remove?: boolean): {
|
|
14
|
+
session_id: string;
|
|
15
|
+
status: string;
|
|
16
|
+
removed: boolean;
|
|
17
|
+
};
|
|
18
|
+
/**
 * Builds the JSON payload for a screenshot.
 *
 * @param sessionId Session the screenshot belongs to; emitted as `session_id`.
 * @param screenshotPath Path string, emitted unchanged as `screenshot_path`.
 */
export declare function buildScreenshotPayload(sessionId: string, screenshotPath: string): {
|
|
19
|
+
session_id: string;
|
|
20
|
+
screenshot_path: string;
|
|
21
|
+
};
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
/**
 * Builds the JSON payload for a finished task.
 *
 * @param sessionId Session the task ran in; emitted as `session_id`.
 * @param result Task result, passed through unchanged.
 * @returns `{ session_id, status: 'completed', result }`.
 */
export function buildTaskCompletePayload(sessionId, result) {
|
|
2
|
+
return {
|
|
3
|
+
session_id: sessionId,
|
|
4
|
+
// NOTE(review): the literal here is 'completed', while AgentLoopResult.status
// in agent/loop.d.ts uses "complete" — confirm the difference is intended.
status: 'completed',
|
|
5
|
+
result,
|
|
6
|
+
};
|
|
7
|
+
}
|
|
8
|
+
/**
 * Builds the JSON payload for a failed task.
 *
 * @param sessionId Session the task ran in; emitted as `session_id`.
 * @param error Error message, passed through unchanged.
 * @returns `{ session_id, status: 'error', error }`.
 */
export function buildTaskErrorPayload(sessionId, error) {
|
|
9
|
+
return {
|
|
10
|
+
session_id: sessionId,
|
|
11
|
+
status: 'error',
|
|
12
|
+
error,
|
|
13
|
+
};
|
|
14
|
+
}
|
|
15
|
+
/**
 * Builds the JSON payload for a status query.
 *
 * @param status A single session status or an array of them.
 * @returns The same value, unchanged.
 */
export function buildStatusPayload(status) {
|
|
16
|
+
// Identity: the status object(s) are already in output shape.
return status;
|
|
17
|
+
}
|
|
18
|
+
/**
 * Builds the JSON payload for a stop request. Only reports what happened;
 * nothing is stopped or removed by this function.
 *
 * @param sessionId Session that was stopped; emitted as `session_id`.
 * @param remove Echoed back as `removed`; defaults to false.
 * @returns `{ session_id, status: 'stopped', removed }`.
 */
export function buildStopPayload(sessionId, remove = false) {
|
|
19
|
+
return {
|
|
20
|
+
session_id: sessionId,
|
|
21
|
+
status: 'stopped',
|
|
22
|
+
removed: remove,
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
/**
 * Builds the JSON payload for a screenshot.
 *
 * @param sessionId Session the screenshot belongs to; emitted as `session_id`.
 * @param screenshotPath Path string, emitted unchanged as `screenshot_path`.
 * @returns `{ session_id, screenshot_path }`.
 */
export function buildScreenshotPayload(sessionId, screenshotPath) {
|
|
26
|
+
return {
|
|
27
|
+
session_id: sessionId,
|
|
28
|
+
screenshot_path: screenshotPath,
|
|
29
|
+
};
|
|
30
|
+
}
|
package/dist/cli/setup.d.ts
CHANGED
|
@@ -4,7 +4,58 @@
|
|
|
4
4
|
* Scans the machine for Claude Code, Cursor, Windsurf, and Claude Desktop,
|
|
5
5
|
* then merges the Hanzi MCP server entry into each agent's config file.
|
|
6
6
|
*/
|
|
7
|
+
/**
 * Describes one AI agent that setup knows how to configure.
 * NOTE(review): the implementation is not visible here; field notes below are
 * read off the names and types — confirm against setup.js.
 */
interface AgentConfig {
|
|
8
|
+
name: string;
|
|
9
|
+
slug: string;
|
|
10
|
+
/** Configuration strategy: merge into a JSON config file, or run a CLI command. */
method: 'json-merge' | 'cli-command';
|
|
11
|
+
/** Presumably reports whether the agent is installed on this machine — TODO confirm. */
detect: () => boolean;
|
|
12
|
+
/** Presumably only set for the 'json-merge' method — TODO confirm. */
configPath?: () => string;
|
|
13
|
+
/** Presumably only set for the 'cli-command' method — TODO confirm. */
cliCommand?: string;
|
|
14
|
+
skillsDir?: () => string;
|
|
15
|
+
}
|
|
16
|
+
/** Outcome of a single configuration attempt, as returned by mergeJsonConfig. */
interface SetupResult {
|
|
17
|
+
/** Presumably the agent's display name or slug — TODO confirm which. */
agent: string;
|
|
18
|
+
status: 'configured' | 'already-configured' | 'skipped' | 'error';
|
|
19
|
+
/** Free-text detail accompanying the status. */
detail: string;
|
|
20
|
+
}
|
|
21
|
+
/**
 * Optional overrides accepted by getAgentRegistry; every field may be omitted.
 * NOTE(review): presumably injection points that default to the real
 * environment (home directory, platform, filesystem, process execution) so the
 * registry can be tested — confirm against setup.js.
 */
interface AgentRegistryDeps {
|
|
22
|
+
home?: string;
|
|
23
|
+
plat?: NodeJS.Platform;
|
|
24
|
+
appData?: string;
|
|
25
|
+
pathExists?: (path: string) => boolean;
|
|
26
|
+
runCommand?: (command: string, options?: any) => Buffer | string;
|
|
27
|
+
}
|
|
28
|
+
/**
 * Optional filesystem overrides accepted by mergeJsonConfig; every field may
 * be omitted.
 * NOTE(review): presumably default to the corresponding Node fs functions —
 * confirm against setup.js.
 */
interface JsonConfigDeps {
|
|
29
|
+
pathExists?: (path: string) => boolean;
|
|
30
|
+
readTextFile?: (path: string, encoding: BufferEncoding) => string;
|
|
31
|
+
writeTextFile?: (path: string, contents: string) => void;
|
|
32
|
+
ensureDir?: (path: string, options: {
|
|
33
|
+
recursive: boolean;
|
|
34
|
+
}) => void;
|
|
35
|
+
copyFile?: (source: string, destination: string) => void;
|
|
36
|
+
}
|
|
37
|
+
/**
 * Optional overrides accepted by detectBrowsers; every field may be omitted.
 * NOTE(review): presumably default to the real platform, filesystem and
 * process execution — confirm against setup.js.
 */
interface BrowserDetectionDeps {
|
|
38
|
+
plat?: NodeJS.Platform;
|
|
39
|
+
pathExists?: (path: string) => boolean;
|
|
40
|
+
runCommand?: (command: string, options?: any) => Buffer | string;
|
|
41
|
+
}
|
|
42
|
+
/**
 * Returns the list of agent descriptions used by setup.
 *
 * @param deps Optional environment overrides.
 */
export declare function getAgentRegistry(deps?: AgentRegistryDeps): AgentConfig[];
|
|
43
|
+
/**
 * Per this file's header, setup merges the Hanzi MCP server entry into an
 * agent's config file; this presumably performs that merge for the file at
 * `configPath` and reports the outcome — TODO confirm against setup.js.
 *
 * @param configPath Path of the JSON config file.
 * @param deps Optional filesystem overrides.
 */
export declare function mergeJsonConfig(configPath: string, deps?: JsonConfigDeps): SetupResult;
|
|
44
|
+
/**
 * Describes a browser and how to locate it on each platform.
 * NOTE(review): presumably `macApp` is a macOS application name, `linuxBin` a
 * Linux executable name and `winPaths` candidate Windows install paths —
 * confirm against setup.js.
 */
interface BrowserInfo {
|
|
45
|
+
name: string;
|
|
46
|
+
slug: string;
|
|
47
|
+
macApp: string;
|
|
48
|
+
linuxBin: string;
|
|
49
|
+
winPaths: string[];
|
|
50
|
+
}
|
|
51
|
+
/**
 * Returns browser descriptions; presumably only for browsers found on this
 * machine — TODO confirm.
 *
 * @param deps Optional environment overrides.
 */
export declare function detectBrowsers(deps?: BrowserDetectionDeps): BrowserInfo[];
|
|
52
|
+
/**
 * Decides, from the `yes` option and whether stdin is a TTY, whether setup
 * runs interactively.
 * NOTE(review): the exact rule and the meaning of the returned boolean
 * (presumably true = prompts allowed) are not visible here — confirm.
 */
export declare function resolveInteractiveMode(options?: {
|
|
53
|
+
yes?: boolean;
|
|
54
|
+
}, stdinIsTTY?: boolean): boolean;
|
|
55
|
+
/**
 * Returns a command string; presumably the shell command that opens `url` in
 * `browser` on platform `plat` — TODO confirm.
 */
export declare function buildBrowserOpenCommand(browser: BrowserInfo, url: string, plat: NodeJS.Platform): string;
|
|
56
|
+
/**
 * Returns a command string; presumably the shell command that opens `url` with
 * the platform's default handler — TODO confirm.
 */
export declare function buildSystemOpenCommand(url: string, plat: NodeJS.Platform): string;
|
|
7
57
|
/**
 * Entry point of the setup flow described in this file's header.
 * NOTE(review): presumably `only` restricts setup to a single agent and `yes`
 * skips interactive prompts — confirm against setup.js.
 */
export declare function runSetup(options?: {
|
|
8
58
|
only?: string;
|
|
9
59
|
yes?: boolean;
|
|
10
60
|
}): Promise<void>;
|
|
61
|
+
export {};
|