windows-use 0.2.0 → 0.3.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/cli.js +67 -15
- package/dist/cli.js.map +1 -1
- package/dist/index.js +37 -12
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +37 -12
- package/dist/mcp/server.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -120,21 +120,49 @@ function buildSystemPrompt() {
|
|
|
120
120
|
return `You are a precise Windows and browser automation agent. Your job is to execute instructions by calling the tools available to you.
|
|
121
121
|
|
|
122
122
|
## Workflow
|
|
123
|
-
1. Take a screenshot first to understand the current state of the screen.
|
|
123
|
+
1. Take a \`screenshot\` first to understand the current state of the screen.
|
|
124
124
|
2. Plan the minimal sequence of actions needed.
|
|
125
|
-
3. Execute
|
|
125
|
+
3. Execute actions, verifying with screenshots at key checkpoints (not after every single action).
|
|
126
126
|
4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
|
|
127
127
|
|
|
128
|
+
## Reading Screenshots
|
|
129
|
+
- Desktop screenshots include a **coordinate grid overlay**. The grid labels show pixel coordinates that directly correspond to \`mouse_click\` and \`mouse_move\` coordinates.
|
|
130
|
+
- Use the grid numbers to estimate the (x, y) position of UI elements. For example, if a button appears near the grid label "400" horizontally and "300" vertically, click at approximately (400, 300).
|
|
131
|
+
- The bottom-right corner label shows the total screen dimensions.
|
|
132
|
+
|
|
133
|
+
## Tool Selection
|
|
134
|
+
- **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
|
|
135
|
+
- **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
|
|
136
|
+
- **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
|
|
137
|
+
- **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
|
|
138
|
+
|
|
139
|
+
## Smart Screenshot Strategy
|
|
140
|
+
- ALWAYS take a screenshot before your first action.
|
|
141
|
+
- Take a screenshot to verify after **critical actions** (clicking a button, submitting a form, navigating to a new page).
|
|
142
|
+
- Skip verification screenshots for **low-risk sequential actions** (typing text, pressing modifier keys, scrolling) \u2014 verify after the sequence is complete instead.
|
|
143
|
+
- If an action might trigger a loading state, wait briefly then screenshot to confirm the page has loaded.
|
|
144
|
+
|
|
128
145
|
## Rules
|
|
129
|
-
- ALWAYS take a screenshot before your first action to understand the current state.
|
|
130
|
-
- After every mouse click or keyboard action, take a screenshot to verify the result.
|
|
131
146
|
- Call ONE tool at a time. Never request multiple tools in parallel.
|
|
132
147
|
- Before each tool call, briefly state what you are about to do and why.
|
|
133
148
|
- After receiving a tool result, describe what you observed.
|
|
134
|
-
- For browser tasks, prefer using browser_* tools over clicking on-screen coordinates.
|
|
135
|
-
- For terminal tasks, prefer \`run_command\` over GUI interactions when possible.
|
|
136
149
|
- Do not read or write files unless the instruction explicitly asks for it.
|
|
137
150
|
|
|
151
|
+
## Handling Common Situations
|
|
152
|
+
- **Loading/transitions**: If a page or app is loading, take another screenshot after a moment instead of acting immediately.
|
|
153
|
+
- **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
|
|
154
|
+
- **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
|
|
155
|
+
- **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
|
|
156
|
+
- **Text input**: For browser forms, prefer \`browser_type\` with the CSS selector. For desktop apps, click the input field first, then use \`keyboard_type\`.
|
|
157
|
+
- **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
|
|
158
|
+
|
|
159
|
+
## Error Recovery
|
|
160
|
+
- If an action fails or produces unexpected results, take a screenshot to reassess the situation before trying again.
|
|
161
|
+
- Try a different approach rather than repeating the same failed action. For example:
|
|
162
|
+
- If \`browser_click\` fails on a selector, try a different selector or fall back to coordinate-based \`mouse_click\`.
|
|
163
|
+
- If a UI element is not visible, try scrolling or switching tabs/windows.
|
|
164
|
+
- If something fails **twice with different approaches**, call \`report\` with status "blocked".
|
|
165
|
+
|
|
138
166
|
## report Tool
|
|
139
167
|
Call \`report\` when:
|
|
140
168
|
- **"completed"**: The task is done successfully. Summarize what was accomplished.
|
|
@@ -150,12 +178,9 @@ report({
|
|
|
150
178
|
})
|
|
151
179
|
\`\`\`
|
|
152
180
|
|
|
153
|
-
Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
|
|
181
|
+
Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report. Include relevant screenshots in your report so the caller can see the final state.
|
|
154
182
|
|
|
155
|
-
|
|
156
|
-
- Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
|
|
157
|
-
- If a UI element is not where you expect it, try scrolling first before giving up.
|
|
158
|
-
- Keep your responses concise. Focus on actions, not explanations.`;
|
|
183
|
+
You can also use \`use_local_image\` to load a local image file and get a screenshot ID for embedding in reports.`;
|
|
159
184
|
}
|
|
160
185
|
var init_system_prompt = __esm({
|
|
161
186
|
"src/agent/system-prompt.ts"() {
|
|
@@ -776,7 +801,7 @@ var init_screenshot = __esm({
|
|
|
776
801
|
const image = primary.captureImageSync();
|
|
777
802
|
const physW = image.width;
|
|
778
803
|
const physH = image.height;
|
|
779
|
-
const scaleFactor = primary.scaleFactor ?? 1;
|
|
804
|
+
const scaleFactor = primary.scaleFactor() ?? 1;
|
|
780
805
|
const logicalW = Math.round(physW / scaleFactor);
|
|
781
806
|
const logicalH = Math.round(physH / scaleFactor);
|
|
782
807
|
const raw = image.toRawSync();
|
|
@@ -1617,7 +1642,7 @@ init_types();
|
|
|
1617
1642
|
import { program } from "commander";
|
|
1618
1643
|
import { createInterface } from "readline";
|
|
1619
1644
|
import { createServer } from "http";
|
|
1620
|
-
import { mkdirSync as mkdirSync2, writeFileSync } from "fs";
|
|
1645
|
+
import { existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3, writeFileSync } from "fs";
|
|
1621
1646
|
import { join as join3 } from "path";
|
|
1622
1647
|
import { tmpdir } from "os";
|
|
1623
1648
|
function startScreenshotServer(screenshotDir) {
|
|
@@ -1654,7 +1679,35 @@ function startScreenshotServer(screenshotDir) {
|
|
|
1654
1679
|
});
|
|
1655
1680
|
}
|
|
1656
1681
|
program.name("windows-use").description("Run Windows/browser automation tasks using a small LLM agent").version("0.2.0");
|
|
1657
|
-
program.command("init").description("Interactive setup \u2014 save config to ~/. […]
|
|
1682
|
+
program.command("init").description("Interactive setup, or import/export config via base64").argument("[base64]", "Import config from a base64 string").option("--export", "Export current config as a base64 string").action(async (base64Input, opts) => {
|
|
1683
|
+
const configPath = getConfigPath();
|
|
1684
|
+
if (opts.export) {
|
|
1685
|
+
if (!existsSync4(configPath)) {
|
|
1686
|
+
console.error("No config found. Run `windows-use init` first.");
|
|
1687
|
+
process.exit(1);
|
|
1688
|
+
}
|
|
1689
|
+
const raw = readFileSync3(configPath, "utf-8");
|
|
1690
|
+
const encoded = Buffer.from(raw).toString("base64");
|
|
1691
|
+
console.log(encoded);
|
|
1692
|
+
return;
|
|
1693
|
+
}
|
|
1694
|
+
if (base64Input) {
|
|
1695
|
+
try {
|
|
1696
|
+
const decoded = Buffer.from(base64Input, "base64").toString("utf-8");
|
|
1697
|
+
const parsed = JSON.parse(decoded);
|
|
1698
|
+
writeFileSync(configPath, JSON.stringify(parsed, null, 2) + "\n", "utf-8");
|
|
1699
|
+
console.log(`\u2705 Config imported to ${configPath}`);
|
|
1700
|
+
const display = { ...parsed };
|
|
1701
|
+
if (display.apiKey) {
|
|
1702
|
+
display.apiKey = display.apiKey.slice(0, 6) + "..." + display.apiKey.slice(-4);
|
|
1703
|
+
}
|
|
1704
|
+
console.log(JSON.stringify(display, null, 2));
|
|
1705
|
+
} catch {
|
|
1706
|
+
console.error("Invalid base64 or JSON. Make sure you copied the full string.");
|
|
1707
|
+
process.exit(1);
|
|
1708
|
+
}
|
|
1709
|
+
return;
|
|
1710
|
+
}
|
|
1658
1711
|
const rl = createInterface({ input: process.stdin, output: process.stdout });
|
|
1659
1712
|
const ask = (q) => new Promise((resolve) => rl.question(q, (a) => resolve(a.trim())));
|
|
1660
1713
|
console.log("\n\u{1F527} windows-use setup\n");
|
|
@@ -1666,7 +1719,6 @@ program.command("init").description("Interactive setup \u2014 save config to ~/.
|
|
|
1666
1719
|
if (baseURL) config.baseURL = baseURL;
|
|
1667
1720
|
if (apiKey) config.apiKey = apiKey;
|
|
1668
1721
|
if (model) config.model = model;
|
|
1669
|
-
const configPath = getConfigPath();
|
|
1670
1722
|
writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n", "utf-8");
|
|
1671
1723
|
console.log(`
|
|
1672
1724
|
\u2705 Config saved to ${configPath}`);
|