hanzi-browse 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +182 -0
- package/dist/agent/loop.d.ts +63 -0
- package/dist/agent/loop.js +186 -0
- package/dist/agent/system-prompt.d.ts +7 -0
- package/dist/agent/system-prompt.js +41 -0
- package/dist/agent/tools.d.ts +9 -0
- package/dist/agent/tools.js +154 -0
- package/dist/cli/detect-credentials.d.ts +31 -0
- package/dist/cli/detect-credentials.js +44 -0
- package/dist/cli/import-credentials-handler.d.ts +14 -0
- package/dist/cli/import-credentials-handler.js +22 -0
- package/dist/cli/session-files.d.ts +28 -0
- package/dist/cli/session-files.js +118 -0
- package/dist/cli/setup.d.ts +10 -0
- package/dist/cli/setup.js +915 -0
- package/dist/cli.d.ts +16 -0
- package/dist/cli.js +506 -0
- package/dist/dashboard/assets/index-CEFyesbT.js +46 -0
- package/dist/dashboard/assets/index-Dnht2kLU.css +1 -0
- package/dist/dashboard/index.html +13 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +1116 -0
- package/dist/ipc/index.d.ts +8 -0
- package/dist/ipc/index.js +8 -0
- package/dist/ipc/native-host.d.ts +96 -0
- package/dist/ipc/native-host.js +223 -0
- package/dist/ipc/websocket-client.d.ts +73 -0
- package/dist/ipc/websocket-client.js +199 -0
- package/dist/license/manager.d.ts +20 -0
- package/dist/license/manager.js +15 -0
- package/dist/llm/client.d.ts +72 -0
- package/dist/llm/client.js +227 -0
- package/dist/llm/credentials.d.ts +61 -0
- package/dist/llm/credentials.js +200 -0
- package/dist/llm/vertex.d.ts +22 -0
- package/dist/llm/vertex.js +335 -0
- package/dist/managed/api-http.test.d.ts +7 -0
- package/dist/managed/api-http.test.js +623 -0
- package/dist/managed/api.d.ts +51 -0
- package/dist/managed/api.js +1448 -0
- package/dist/managed/api.test.d.ts +10 -0
- package/dist/managed/api.test.js +146 -0
- package/dist/managed/auth.d.ts +38 -0
- package/dist/managed/auth.js +192 -0
- package/dist/managed/billing.d.ts +70 -0
- package/dist/managed/billing.js +227 -0
- package/dist/managed/deploy.d.ts +17 -0
- package/dist/managed/deploy.js +385 -0
- package/dist/managed/e2e.test.d.ts +15 -0
- package/dist/managed/e2e.test.js +151 -0
- package/dist/managed/hardening.test.d.ts +14 -0
- package/dist/managed/hardening.test.js +346 -0
- package/dist/managed/integration.test.d.ts +8 -0
- package/dist/managed/integration.test.js +274 -0
- package/dist/managed/log.d.ts +18 -0
- package/dist/managed/log.js +31 -0
- package/dist/managed/server.d.ts +12 -0
- package/dist/managed/server.js +69 -0
- package/dist/managed/store-pg.d.ts +191 -0
- package/dist/managed/store-pg.js +479 -0
- package/dist/managed/store.d.ts +188 -0
- package/dist/managed/store.js +379 -0
- package/dist/relay/auto-start.d.ts +19 -0
- package/dist/relay/auto-start.js +71 -0
- package/dist/relay/server.d.ts +17 -0
- package/dist/relay/server.js +403 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.js +4 -0
- package/dist/types/session.d.ts +134 -0
- package/dist/types/session.js +16 -0
- package/package.json +61 -0
- package/skills/README.md +48 -0
- package/skills/a11y-auditor/SKILL.md +42 -0
- package/skills/e2e-tester/SKILL.md +154 -0
- package/skills/hanzi-browse/SKILL.md +182 -0
- package/skills/linkedin-prospector/SKILL.md +149 -0
- package/skills/social-poster/SKILL.md +146 -0
- package/skills/x-marketer/SKILL.md +479 -0
package/README.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# Hanzi Browse — MCP Server
|
|
2
|
+
|
|
3
|
+
The MCP server exposes browser tools to MCP clients and forwards browser work to
|
|
4
|
+
the Chrome extension over the local WebSocket relay.
|
|
5
|
+
|
|
6
|
+
## Setup
|
|
7
|
+
|
|
8
|
+
```bash
|
|
9
|
+
cd mcp-server
|
|
10
|
+
npm install
|
|
11
|
+
npm run build
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Add to your MCP config (e.g., `~/.claude/claude_desktop_config.json`):
|
|
15
|
+
|
|
16
|
+
```json
|
|
17
|
+
{
|
|
18
|
+
"mcpServers": {
|
|
19
|
+
"browser": {
|
|
20
|
+
"command": "node",
|
|
21
|
+
"args": ["/path/to/hanzi-browse/mcp-server/dist/index.js"]
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**Prerequisites:** The Chrome extension must be installed and running. See the [main README](../README.md) for full setup.
|
|
28
|
+
|
|
29
|
+
## How It Works
|
|
30
|
+
|
|
31
|
+
```text
|
|
32
|
+
MCP client
|
|
33
|
+
-> mcp-server (stdio)
|
|
34
|
+
-> relay (WebSocket)
|
|
35
|
+
-> Chrome extension
|
|
36
|
+
-> browser agent
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
The extension is the browser executor. The MCP server should only manage MCP
|
|
40
|
+
tool calls, local session bookkeeping, and blocking waits for completion.
|
|
41
|
+
|
|
42
|
+
## Tools
|
|
43
|
+
|
|
44
|
+
### `browser_start`
|
|
45
|
+
|
|
46
|
+
Start a browser task. **Blocks until complete or timeout**.
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
browser_start(
|
|
50
|
+
task: "Search for flights to Tokyo on Google Flights",
|
|
51
|
+
url: "https://flights.google.com", // optional starting URL
|
|
52
|
+
context: "Departing March 15, economy" // optional extra info
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
→ {
|
|
56
|
+
"session_id": "abc123",
|
|
57
|
+
"status": "complete",
|
|
58
|
+
"task": "Search for flights to Tokyo...",
|
|
59
|
+
"answer": "Found 3 flights: JAL $850, ANA $920, United $780",
|
|
60
|
+
"total_steps": 8,
|
|
61
|
+
"recent_steps": ["Opened Google Flights", "Set destination to Tokyo", ...]
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### `browser_message`
|
|
66
|
+
|
|
67
|
+
Send follow-up instructions to an existing session. Also blocks until the agent finishes.
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
browser_message(session_id: "abc123", message: "Book the cheapest one")
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### `browser_status`
|
|
74
|
+
|
|
75
|
+
Check known sessions and their latest status.
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
browser_status() // all active sessions
|
|
79
|
+
browser_status(session_id: "abc123") // specific session
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### `browser_stop`
|
|
83
|
+
|
|
84
|
+
Stop a task.
|
|
85
|
+
|
|
86
|
+
```
|
|
87
|
+
browser_stop(session_id: "abc123")
|
|
88
|
+
browser_stop(session_id: "abc123", remove: true) // also delete session
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### `browser_screenshot`
|
|
92
|
+
|
|
93
|
+
Capture the current browser state as an image.
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
browser_screenshot(session_id: "abc123")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Examples
|
|
100
|
+
|
|
101
|
+
**Research:**
|
|
102
|
+
```
|
|
103
|
+
browser_start("Find the top 3 competitors for Acme Corp and summarize their pricing")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Logged-in workflows:**
|
|
107
|
+
```
|
|
108
|
+
browser_start("Go to Jira, find my open tickets, and summarize what needs attention this week")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Multi-turn:**
|
|
112
|
+
```
|
|
113
|
+
s = browser_start("Go to LinkedIn and find AI Engineer jobs in Montreal")
|
|
114
|
+
→ { session_id: "x1", answer: "Found: Applied AI Engineer at Cohere" }
|
|
115
|
+
|
|
116
|
+
browser_message("x1", "Click into that job and tell me the requirements")
|
|
117
|
+
→ { answer: "Requirements: 3+ years Python, ML experience..." }
|
|
118
|
+
|
|
119
|
+
browser_message("x1", "Apply to this job using my profile")
|
|
120
|
+
→ { answer: "Application submitted successfully" }
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
**Parallel execution:**
|
|
124
|
+
```
|
|
125
|
+
browser_start("Check flight prices to Tokyo")
|
|
126
|
+
browser_start("Check hotel prices in Shibuya")
|
|
127
|
+
browser_start("Look up train pass costs")
|
|
128
|
+
// All three run simultaneously
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Configuration
|
|
132
|
+
|
|
133
|
+
| Environment Variable | Default | Description |
|
|
134
|
+
|---|---|---|
|
|
135
|
+
| `HANZI_IN_CHROME_MAX_SESSIONS` | `5` | Max concurrent browser tasks |
|
|
136
|
+
| `WS_RELAY_PORT` | `7862` | WebSocket relay port |
|
|
137
|
+
|
|
138
|
+
## Architecture
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
AI Tool (Claude Code, Cursor, etc.)
|
|
142
|
+
↓ MCP Protocol (stdio)
|
|
143
|
+
MCP Server
|
|
144
|
+
↓ WebSocket
|
|
145
|
+
Relay Server
|
|
146
|
+
↓ WebSocket
|
|
147
|
+
Chrome Extension
|
|
148
|
+
↓ Extension agent loop
|
|
149
|
+
Target Website
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
The relay server starts automatically when the MCP server connects. It routes
|
|
153
|
+
messages between the MCP server and the Chrome extension and briefly queues
|
|
154
|
+
messages while the extension service worker is asleep.
|
|
155
|
+
|
|
156
|
+
> **Principle**: Hanzi is for real browser work in your signed-in Chrome.
|
|
157
|
+
> Agents should prefer code, logs, APIs, and existing tools first. Use Hanzi when the job needs a real browser session.
|
|
158
|
+
|
|
159
|
+
## Prompts
|
|
160
|
+
|
|
161
|
+
The server exposes MCP prompts that clients auto-discover as slash commands:
|
|
162
|
+
|
|
163
|
+
| Prompt | Description |
|
|
164
|
+
|--------|-------------|
|
|
165
|
+
| `linkedin-prospector` | Goal-driven LinkedIn outreach — networking, sales, partnerships, or hiring |
|
|
166
|
+
| `e2e-tester` | Test your app in a real browser — reports bugs with screenshots and code references |
|
|
167
|
+
| `social-poster` | Post across LinkedIn, Twitter, Reddit, HN — drafts per-platform, posts from your browser |
|
|
168
|
+
|
|
169
|
+
In Claude Code, use the built-in `linkedin-prospector` prompt from the MCP prompt list.
|
|
170
|
+
|
|
171
|
+
## Skills CLI
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
hanzi-browser skills # list available skills
|
|
175
|
+
hanzi-browser skills install linkedin-prospector # install SKILL.md to your project
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Skills are portable SKILL.md files for agents that don't support MCP prompts (Cline, Codex). Each skill follows the same principle: use existing tools first, Hanzi only for real browser steps.
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
|
|
182
|
+
[Polyform Noncommercial 1.0.0](../LICENSE)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Server-Side Managed Agent Loop
|
|
3
|
+
*
|
|
4
|
+
* Drives the browser automation agent from the server:
|
|
5
|
+
* 1. Receives a task
|
|
6
|
+
* 2. Calls Vertex AI (via callLLM) with system prompt + tools
|
|
7
|
+
* 3. For each tool_use: sends execution request to extension via WebSocket relay
|
|
8
|
+
* 4. Gets tool results back from extension
|
|
9
|
+
* 5. Feeds results back to Vertex AI
|
|
10
|
+
* 6. Repeats until end_turn or max steps
|
|
11
|
+
* 7. Returns the final answer
|
|
12
|
+
*
|
|
13
|
+
* The extension is a dumb tool executor — it only runs tools and returns results.
|
|
14
|
+
* All intelligence lives here.
|
|
15
|
+
*/
|
|
16
|
+
/**
 * Inputs for `runAgentLoop`: the task to perform plus the hooks the loop uses
 * to execute tools on the extension and to report progress.
 */
export interface AgentLoopParams {
|
|
17
|
+
/** The task description; it becomes (part of) the first user message sent to the model. */
|
|
18
|
+
task: string;
|
|
19
|
+
/** Optional starting URL. When set, the first message is phrased as "Navigate to <url> first, then: <task>". */
|
|
20
|
+
url?: string;
|
|
21
|
+
/** Optional context (form data, preferences, etc.). Appended to the first message wrapped in <context> tags. */
|
|
22
|
+
context?: string;
|
|
23
|
+
/** Function to execute a tool on the extension. Returns the tool result. If it rejects, the loop reports the failure to the model as a tool error instead of aborting. */
|
|
24
|
+
executeTool: (toolName: string, toolInput: Record<string, any>) => Promise<ToolResult>;
|
|
25
|
+
/** Optional callback for step updates (called before each LLM call, around each tool execution, and on completion). */
|
|
26
|
+
onStep?: (step: StepUpdate) => void;
|
|
27
|
+
/** Optional callback for streaming text; passed through to the LLM client unchanged. */
|
|
28
|
+
onText?: (chunk: string) => void;
|
|
29
|
+
/** Max agent loop iterations (default: 50). When exhausted, the result status is "max_steps". */
|
|
30
|
+
maxSteps?: number;
|
|
31
|
+
/** Abort signal. Checked before each LLM call and after each tool execution, and also forwarded to the LLM client. */
|
|
32
|
+
signal?: AbortSignal;
|
|
33
|
+
}
|
|
34
|
+
/**
 * Outcome of running one tool on the extension, as returned by `executeTool`.
 */
export interface ToolResult {
|
|
35
|
+
/** Whether the tool ran successfully. The agent loop sets this to false when `executeTool` throws; the loop itself decides what to tell the model from `error`, not from this flag. */
success: boolean;
|
|
36
|
+
/** Tool output. Sent to the model as-is when it is a string, otherwise JSON-stringified. */
output?: any;
|
|
37
|
+
/** Failure description. When set, the model is told `Error: <error>` instead of `output`. */
error?: string;
|
|
38
|
+
/** Base64 screenshot if the tool returned one */
|
|
39
|
+
screenshot?: {
|
|
40
|
+
/** Image payload; forwarded to the model as the `data` of a base64 image source. */
data: string;
|
|
41
|
+
/** Forwarded to the model as the image's `media_type`. */
mediaType: string;
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
/**
 * Progress notification passed to `AgentLoopParams.onStep`.
 */
export interface StepUpdate {
|
|
45
|
+
/** 1-based loop iteration this update belongs to. */
step: number;
|
|
46
|
+
/**
 * Phase of the iteration: "thinking" before each LLM call, "tool_use" before a
 * tool is executed, "tool_result" after it returns, "complete" with the final answer.
 * NOTE(review): no emission of "error" is visible in loop.js — confirm whether other callers use it.
 */
status: "thinking" | "tool_use" | "tool_result" | "complete" | "error";
|
|
47
|
+
/** Tool being executed; set on "tool_use" and "tool_result" updates. */
toolName?: string;
|
|
48
|
+
/** Input the model supplied for the tool; set on "tool_use" updates. */
toolInput?: Record<string, any>;
|
|
49
|
+
/** Final answer text; set on "complete" updates. */
text?: string;
|
|
50
|
+
}
|
|
51
|
+
/**
 * Final outcome of `runAgentLoop`.
 */
export interface AgentLoopResult {
|
|
52
|
+
/** "complete" when the model finished, "error" when the task was cancelled or an LLM call failed, "max_steps" when the step limit ran out. */
status: "complete" | "error" | "max_steps";
|
|
53
|
+
/** Final text: the model's answer (or a default completion message), a cancellation/failure message, or the last assistant text when the step limit was hit. */
answer: string;
|
|
54
|
+
/** Loop iteration count at the time the loop returned (equals the step limit for "max_steps"). */
steps: number;
|
|
55
|
+
/** Totals accumulated over every LLM call that returned a response. */
usage: {
|
|
56
|
+
inputTokens: number;
|
|
57
|
+
outputTokens: number;
|
|
58
|
+
/** Number of LLM calls that returned a response (failed calls are not counted). */
apiCalls: number;
|
|
59
|
+
};
|
|
60
|
+
/** The model used for the last LLM call (for billing attribution) */
|
|
61
|
+
model?: string;
|
|
62
|
+
}
|
|
63
|
+
/**
 * Runs the server-side agent loop for one task: alternates LLM calls and tool
 * executions until the model stops requesting tools, the step limit is hit,
 * an LLM call fails, or the abort signal fires.
 *
 * Cancellation and LLM failures are reported through the resolved result
 * (`status: "error"`) rather than by rejecting.
 *
 * @param params Task description, tool executor and optional callbacks/limits.
 * @returns The final answer together with step and token usage totals.
 */
export declare function runAgentLoop(params: AgentLoopParams): Promise<AgentLoopResult>;
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Server-Side Managed Agent Loop
|
|
3
|
+
*
|
|
4
|
+
* Drives the browser automation agent from the server:
|
|
5
|
+
* 1. Receives a task
|
|
6
|
+
* 2. Calls Vertex AI (via callLLM) with system prompt + tools
|
|
7
|
+
* 3. For each tool_use: sends execution request to extension via WebSocket relay
|
|
8
|
+
* 4. Gets tool results back from extension
|
|
9
|
+
* 5. Feeds results back to Vertex AI
|
|
10
|
+
* 6. Repeats until end_turn or max steps
|
|
11
|
+
* 7. Returns the final answer
|
|
12
|
+
*
|
|
13
|
+
* The extension is a dumb tool executor — it only runs tools and returns results.
|
|
14
|
+
* All intelligence lives here.
|
|
15
|
+
*/
|
|
16
|
+
import { callLLM } from "../llm/client.js";
|
|
17
|
+
import { AGENT_TOOLS } from "./tools.js";
|
|
18
|
+
import { buildSystemPrompt } from "./system-prompt.js";
|
|
19
|
+
// --- Agent Loop ---
|
|
20
|
+
/**
 * Turns whatever a `catch` clause received into a printable message.
 *
 * Non-Error throwables (strings, plain objects) have no reliable `.message`,
 * so fall back to their string form instead of reporting "undefined".
 *
 * @param err The caught value.
 * @returns A message string; identical to `err.message` for real Errors.
 */
function describeError(err) {
    return typeof err?.message === "string" ? err.message : String(err);
}
/**
 * Decides whether a tool failure looks transient (timeouts, relay disconnects)
 * and is therefore worth a single retry.
 *
 * @param message Failure message from the tool executor.
 * @returns True when the message matches one of the known transient markers.
 */
function isTransientToolError(message) {
    return message.includes("timed out") ||
        message.includes("not connected") ||
        message.includes("Relay");
}
/**
 * Builds the text the model sees for one tool result.
 *
 * `JSON.stringify(undefined)` returns `undefined` (not a string), and a tool
 * may legitimately return an empty string; either would yield a text block
 * with missing or empty text, so a placeholder is substituted in those cases.
 *
 * @param result The tool result (never null/undefined at this point).
 * @returns A non-empty string describing the outcome.
 */
function toolResultText(result) {
    if (result.error) {
        return `Error: ${result.error}`;
    }
    let text;
    if (typeof result.output === "string") {
        text = result.output;
    }
    else {
        try {
            text = JSON.stringify(result.output);
        }
        catch {
            // Circular structures / BigInt cannot be serialized; degrade gracefully.
            text = String(result.output);
        }
    }
    if (text) {
        return text;
    }
    return result.success === false
        ? "Error: Tool failed without an error message."
        : "(no output)";
}
/**
 * Server-side agent loop.
 *
 * Sends the task to the LLM, executes every requested tool through
 * `params.executeTool`, feeds the results back, and repeats until the model
 * stops requesting tools, `maxSteps` iterations have run, an LLM call fails,
 * or `params.signal` is aborted.
 *
 * @param params See `AgentLoopParams` (task, url, context, executeTool,
 *   onStep, onText, maxSteps = 50, signal).
 * @returns An `AgentLoopResult`; cancellation and LLM failures are reported
 *   as `status: "error"` in the resolved value rather than by rejecting.
 */
export async function runAgentLoop(params) {
    const { task, url, context, executeTool, onStep, onText, maxSteps = 50, signal, } = params;
    const system = buildSystemPrompt();
    const tools = AGENT_TOOLS;
    // Loop-invariant allow-list of tool names the model may call.
    const allowedToolNames = new Set(tools.map((t) => t.name));
    const messages = [];
    const totalUsage = { inputTokens: 0, outputTokens: 0, apiCalls: 0 };
    let lastModel;
    // Every exit path reports the same usage/model bookkeeping.
    const finish = (status, answer, steps) => ({
        status,
        answer,
        steps,
        usage: totalUsage,
        model: lastModel,
    });
    // Build initial user message
    let userMessage = task;
    if (url) {
        userMessage = `Navigate to ${url} first, then: ${task}`;
    }
    if (context) {
        userMessage += `\n\n<context>\n${context}\n</context>`;
    }
    messages.push({ role: "user", content: userMessage });
    for (let step = 1; step <= maxSteps; step++) {
        if (signal?.aborted) {
            // Cancelled before this iteration started, so it does not count.
            return finish("error", "Task was cancelled.", step - 1);
        }
        onStep?.({ step, status: "thinking" });
        // Call LLM
        let response;
        try {
            response = await callLLM({
                messages,
                system,
                tools,
                signal,
                onText,
            });
        }
        catch (err) {
            const message = describeError(err);
            console.error(`[AgentLoop] LLM call failed at step ${step}:`, message);
            return finish("error", `LLM call failed: ${message}`, step);
        }
        totalUsage.apiCalls++;
        totalUsage.inputTokens += response.usage?.input_tokens || 0;
        totalUsage.outputTokens += response.usage?.output_tokens || 0;
        if (response.model)
            lastModel = response.model;
        // Add assistant response to conversation
        messages.push({ role: "assistant", content: response.content });
        // Extract text and tool calls
        const textBlocks = response.content.filter((b) => b.type === "text");
        const toolUseBlocks = response.content.filter((b) => b.type === "tool_use");
        // If no tool calls, we're done
        if (response.stop_reason === "end_turn" || toolUseBlocks.length === 0) {
            const answer = textBlocks.map((b) => b.text).join("\n").trim();
            onStep?.({ step, status: "complete", text: answer });
            return finish("complete", answer || "Task completed.", step);
        }
        // Execute each tool call
        const toolResults = [];
        for (const toolUse of toolUseBlocks) {
            // Validate tool name against allowed list before forwarding to extension
            if (!allowedToolNames.has(toolUse.name)) {
                console.error(`[AgentLoop] LLM requested unknown tool: ${toolUse.name}`);
                toolResults.push({
                    type: "tool_result",
                    tool_use_id: toolUse.id,
                    content: [{ type: "text", text: `Error: Unknown tool "${toolUse.name}". Available tools: ${[...allowedToolNames].join(", ")}` }],
                });
                continue;
            }
            onStep?.({
                step,
                status: "tool_use",
                toolName: toolUse.name,
                toolInput: toolUse.input,
            });
            let result;
            try {
                result = await executeTool(toolUse.name, toolUse.input);
            }
            catch (err) {
                const message = describeError(err);
                // Retry once on transient errors (timeouts, relay disconnects).
                // NOTE(review): the retry re-sends the same action; confirm this is
                // acceptable for non-idempotent tools (e.g. clicks) after a timeout.
                if (isTransientToolError(message) && !signal?.aborted) {
                    console.error(`[AgentLoop] Transient error on ${toolUse.name}, retrying once: ${message}`);
                    try {
                        result = await executeTool(toolUse.name, toolUse.input);
                    }
                    catch (retryErr) {
                        result = { success: false, error: `${describeError(retryErr)} (after retry)` };
                    }
                }
                else {
                    result = { success: false, error: message };
                }
            }
            // An executor that resolves with nothing would otherwise crash below.
            if (result == null) {
                result = { success: false, error: "Tool returned no result." };
            }
            onStep?.({ step, status: "tool_result", toolName: toolUse.name });
            // Check abort after each tool — don't feed results back to LLM if cancelled
            if (signal?.aborted) {
                return finish("error", "Task was cancelled.", step);
            }
            // Build tool result content block: text first, then optional screenshot
            const resultContent = [{ type: "text", text: toolResultText(result) }];
            if (result.screenshot) {
                resultContent.push({
                    type: "image",
                    source: {
                        type: "base64",
                        media_type: result.screenshot.mediaType,
                        data: result.screenshot.data,
                    },
                });
            }
            toolResults.push({
                type: "tool_result",
                tool_use_id: toolUse.id,
                content: resultContent,
            });
        }
        // Add tool results as user message
        messages.push({ role: "user", content: toolResults });
    }
    // Exceeded max steps: surface the most recent assistant text, if any
    const lastText = messages
        .filter((m) => m.role === "assistant")
        .flatMap((m) => Array.isArray(m.content)
        ? m.content.filter((b) => b.type === "text").map((b) => b.text)
        : [m.content])
        .pop();
    return finish("max_steps", lastText || "Task did not complete within the step limit.", maxSteps);
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* System prompt for server-side managed agent loop.
|
|
3
|
+
*/
|
|
4
|
+
export function buildSystemPrompt() {
|
|
5
|
+
const now = new Date();
|
|
6
|
+
const dateStr = now.toLocaleDateString("en-US", {
|
|
7
|
+
month: "numeric",
|
|
8
|
+
day: "numeric",
|
|
9
|
+
year: "numeric",
|
|
10
|
+
});
|
|
11
|
+
const timeStr = now.toLocaleTimeString("en-US");
|
|
12
|
+
return [
|
|
13
|
+
{
|
|
14
|
+
type: "text",
|
|
15
|
+
text: `You are a web automation assistant with browser tools. Your priority is to complete the user's request efficiently and autonomously.
|
|
16
|
+
|
|
17
|
+
Browser tasks often require long-running, agentic capabilities. When you encounter a user request that feels time-consuming or extensive in scope, you should be persistent and use all available context needed to accomplish the task. The user expects you to work autonomously until the task is complete. Do not ask for permission - just do it.
|
|
18
|
+
|
|
19
|
+
<behavior_instructions>
|
|
20
|
+
The current date is ${dateStr}, ${timeStr}.
|
|
21
|
+
|
|
22
|
+
Keep responses concise and action-oriented.
|
|
23
|
+
Do not use emojis unless asked.
|
|
24
|
+
Do not introduce yourself. Respond to the user's request directly.
|
|
25
|
+
Do not ask for permission or confirmation. Just complete the task.
|
|
26
|
+
</behavior_instructions>
|
|
27
|
+
|
|
28
|
+
<tool_usage_requirements>
|
|
29
|
+
Use "read_page" first to get a DOM tree with numeric element IDs (backendNodeIds). This allows you to reliably target elements.
|
|
30
|
+
|
|
31
|
+
Use numeric element references from read_page (e.g. "42") with the "left_click" action of the "computer" tool and the "form_input" tool. Only use coordinate-based actions when references fail.
|
|
32
|
+
|
|
33
|
+
Use "get_page_text" or "read_page" to efficiently read content instead of repeatedly scrolling.
|
|
34
|
+
|
|
35
|
+
ALWAYS use form_input for ANY dropdown or select element. Never use computer clicks for dropdowns.
|
|
36
|
+
|
|
37
|
+
When a page shows only a loading spinner, use the computer tool with action "wait" (duration 2-3 seconds) then read_page again.
|
|
38
|
+
</tool_usage_requirements>`,
|
|
39
|
+
},
|
|
40
|
+
];
|
|
41
|
+
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool definitions for server-side managed agent loop.
|
|
3
|
+
*
|
|
4
|
+
* These mirror the extension's tool definitions but are used by the server
|
|
5
|
+
* when driving the agent loop via Vertex AI. The extension receives
|
|
6
|
+
* tool execution requests and returns results.
|
|
7
|
+
*/
|
|
8
|
+
import type { Tool } from "../llm/client.js";
|
|
9
|
+
/**
 * Tool schemas offered to the model by the server-side agent loop. The loop
 * also uses these names as an allow-list before forwarding a tool call to the
 * extension.
 */
export declare const AGENT_TOOLS: Tool[];
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool definitions for server-side managed agent loop.
|
|
3
|
+
*
|
|
4
|
+
* These mirror the extension's tool definitions but are used by the server
|
|
5
|
+
* when driving the agent loop via Vertex AI. The extension receives
|
|
6
|
+
* tool execution requests and returns results.
|
|
7
|
+
*/
|
|
8
|
+
export const AGENT_TOOLS = [
|
|
9
|
+
{
|
|
10
|
+
name: "read_page",
|
|
11
|
+
description: `Get a rich DOM tree of the page via Chrome DevTools Protocol. Returns interactive elements with numeric backendNodeId references (e.g., [42]<button>Submit</button>). IMPORTANT: Only use element IDs from the CURRENT output — IDs change between calls. Pierces shadow DOM and iframes automatically.`,
|
|
12
|
+
input_schema: {
|
|
13
|
+
type: "object",
|
|
14
|
+
properties: {
|
|
15
|
+
max_chars: {
|
|
16
|
+
type: "number",
|
|
17
|
+
description: "Maximum characters for output (default: 50000).",
|
|
18
|
+
},
|
|
19
|
+
},
|
|
20
|
+
required: [],
|
|
21
|
+
},
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
name: "find",
|
|
25
|
+
description: `Find elements on the page using natural language. Can search by purpose (e.g., "search bar", "login button") or text content. Returns up to 20 matching elements with references.`,
|
|
26
|
+
input_schema: {
|
|
27
|
+
type: "object",
|
|
28
|
+
properties: {
|
|
29
|
+
query: {
|
|
30
|
+
type: "string",
|
|
31
|
+
description: 'Natural language description of what to find (e.g., "search bar", "add to cart button")',
|
|
32
|
+
},
|
|
33
|
+
},
|
|
34
|
+
required: ["query"],
|
|
35
|
+
},
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
name: "form_input",
|
|
39
|
+
description: `Set values in ANY form element — text inputs, textareas, dropdowns, checkboxes, radio buttons, date pickers. For dropdowns, just pass the desired option text. ALWAYS prefer form_input over computer clicks for form fields.`,
|
|
40
|
+
input_schema: {
|
|
41
|
+
type: "object",
|
|
42
|
+
properties: {
|
|
43
|
+
ref: {
|
|
44
|
+
type: "string",
|
|
45
|
+
description: 'Element reference from read_page (e.g., "42") or find tool (e.g., "ref_1")',
|
|
46
|
+
},
|
|
47
|
+
value: {
|
|
48
|
+
type: "string",
|
|
49
|
+
description: "The value to set.",
|
|
50
|
+
},
|
|
51
|
+
},
|
|
52
|
+
required: ["ref", "value"],
|
|
53
|
+
},
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
name: "computer",
|
|
57
|
+
description: `Use a mouse and keyboard to interact with a web browser, and take screenshots.
|
|
58
|
+
* Click elements using their ref from read_page or find tools.
|
|
59
|
+
* Take screenshots to see the current page state.
|
|
60
|
+
* Scroll to see more content.`,
|
|
61
|
+
input_schema: {
|
|
62
|
+
type: "object",
|
|
63
|
+
properties: {
|
|
64
|
+
action: {
|
|
65
|
+
type: "string",
|
|
66
|
+
enum: [
|
|
67
|
+
"left_click", "right_click", "type", "screenshot", "wait",
|
|
68
|
+
"scroll", "key", "left_click_drag", "double_click", "triple_click",
|
|
69
|
+
"zoom", "scroll_to", "hover",
|
|
70
|
+
],
|
|
71
|
+
description: "The action to perform.",
|
|
72
|
+
},
|
|
73
|
+
coordinate: {
|
|
74
|
+
type: "array",
|
|
75
|
+
items: { type: "number" },
|
|
76
|
+
description: "(x, y) pixel coordinates for click/scroll actions.",
|
|
77
|
+
},
|
|
78
|
+
text: {
|
|
79
|
+
type: "string",
|
|
80
|
+
description: "Text to type or key(s) to press.",
|
|
81
|
+
},
|
|
82
|
+
duration: {
|
|
83
|
+
type: "number",
|
|
84
|
+
description: "Seconds to wait (for wait action). Max 30.",
|
|
85
|
+
},
|
|
86
|
+
scroll_direction: {
|
|
87
|
+
type: "string",
|
|
88
|
+
enum: ["up", "down", "left", "right"],
|
|
89
|
+
description: "Direction to scroll.",
|
|
90
|
+
},
|
|
91
|
+
scroll_amount: {
|
|
92
|
+
type: "number",
|
|
93
|
+
description: "Number of scroll ticks (1-10).",
|
|
94
|
+
},
|
|
95
|
+
ref: {
|
|
96
|
+
type: "string",
|
|
97
|
+
description: "Element reference for click/scroll_to actions.",
|
|
98
|
+
},
|
|
99
|
+
region: {
|
|
100
|
+
type: "array",
|
|
101
|
+
items: { type: "number" },
|
|
102
|
+
description: "(x0, y0, x1, y1) region for zoom action.",
|
|
103
|
+
},
|
|
104
|
+
},
|
|
105
|
+
required: ["action"],
|
|
106
|
+
},
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
name: "navigate",
|
|
110
|
+
description: `Navigate to a URL, or go forward/back in browser history.`,
|
|
111
|
+
input_schema: {
|
|
112
|
+
type: "object",
|
|
113
|
+
properties: {
|
|
114
|
+
url: {
|
|
115
|
+
type: "string",
|
|
116
|
+
description: 'The URL to navigate to. Use "forward"/"back" for history navigation.',
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
required: ["url"],
|
|
120
|
+
},
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
name: "get_page_text",
|
|
124
|
+
description: `Extract raw text content from the page, prioritizing article content. Ideal for reading text-heavy pages.`,
|
|
125
|
+
input_schema: {
|
|
126
|
+
type: "object",
|
|
127
|
+
properties: {
|
|
128
|
+
max_chars: {
|
|
129
|
+
type: "number",
|
|
130
|
+
description: "Maximum characters for output (default: 50000).",
|
|
131
|
+
},
|
|
132
|
+
},
|
|
133
|
+
required: [],
|
|
134
|
+
},
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
name: "javascript_tool",
|
|
138
|
+
description: `Execute JavaScript in the page context. Returns the result of the last expression. Do NOT use 'return' — just write the expression.`,
|
|
139
|
+
input_schema: {
|
|
140
|
+
type: "object",
|
|
141
|
+
properties: {
|
|
142
|
+
action: {
|
|
143
|
+
type: "string",
|
|
144
|
+
description: "Must be 'javascript_exec'.",
|
|
145
|
+
},
|
|
146
|
+
text: {
|
|
147
|
+
type: "string",
|
|
148
|
+
description: "JavaScript code to execute.",
|
|
149
|
+
},
|
|
150
|
+
},
|
|
151
|
+
required: ["action", "text"],
|
|
152
|
+
},
|
|
153
|
+
},
|
|
154
|
+
];
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Credential source detection for CLI setup.
|
|
3
|
+
*
|
|
4
|
+
* Claude Code stores OAuth tokens in one of two locations:
|
|
5
|
+
* 1. ~/.claude/.credentials.json (file-based, all platforms)
|
|
6
|
+
* 2. macOS Keychain under "Claude Code-credentials" (macOS only)
|
|
7
|
+
*
|
|
8
|
+
* The original implementation only checked (1), missing most macOS users.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * One place from which existing credentials could be imported during CLI setup.
 */
export interface CredentialSource {
|
|
11
|
+
/** NOTE(review): presumably a human-readable label shown during setup — confirm against the implementation. */
name: string;
|
|
12
|
+
/** Which tool the credentials belong to. */
slug: 'claude' | 'codex';
|
|
13
|
+
/** NOTE(review): presumably the credentials file location (or a keychain identifier on macOS) — confirm against the implementation. */
path: string;
|
|
14
|
+
}
|
|
15
|
+
/**
 * Environment inputs for `detectCredentialSources`.
 * NOTE(review): these look like injected dependencies so detection can run
 * without touching the real filesystem or keychain — confirm.
 */
export interface DetectOptions {
|
|
16
|
+
/** Platform identifier. NOTE(review): presumably a `process.platform` value such as "darwin" — confirm. */
platform: string;
|
|
17
|
+
/** Home directory of the current user. */
homedir: string;
|
|
18
|
+
/** Returns whether a file exists at the given path. */
fileExists: (path: string) => boolean;
|
|
19
|
+
/** Returns whether the keychain holds an entry for the given service name. */
keychainHas: (service: string) => boolean;
|
|
20
|
+
}
|
|
21
|
+
/**
 * Summary of how the credential step of setup went; input to
 * `checkCredentialFlowResult`.
 */
export interface CredentialFlowState {
|
|
22
|
+
/** Number of credential sources that were detected. */
sourcesDetected: number;
|
|
23
|
+
/** Whether at least one credential import happened. */
anyImported: boolean;
|
|
24
|
+
/** Whether the user chose to enter credentials manually. */
manualEntryChosen: boolean;
|
|
25
|
+
}
|
|
26
|
+
/**
 * Lists the credential sources available in the environment described by `opts`.
 * NOTE(review): per the file header this should cover both the
 * ~/.claude/.credentials.json file and the macOS Keychain entry — the
 * implementation is not visible here, confirm.
 *
 * @param opts Platform, home directory and existence probes to use.
 * @returns The detected sources.
 */
export declare function detectCredentialSources(opts: DetectOptions): CredentialSource[];
|
|
27
|
+
/**
|
|
28
|
+
* Returns an error message if setup finished with no credentials configured,
|
|
29
|
+
* or null if everything is fine.
*
* @param state Summary of the credential step (sources detected, imports done, manual entry chosen).
* @returns A message describing the problem, or null when there is none.
|
|
30
|
+
*/
|
|
31
|
+
export declare function checkCredentialFlowResult(state: CredentialFlowState): string | null;
|