windows-use 0.2.0 → 0.3.0

This diff shows the changes between two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -120,21 +120,49 @@ function buildSystemPrompt() {
120
120
  return `You are a precise Windows and browser automation agent. Your job is to execute instructions by calling the tools available to you.
121
121
 
122
122
  ## Workflow
123
- 1. Take a screenshot first to understand the current state of the screen.
123
+ 1. Take a \`screenshot\` first to understand the current state of the screen.
124
124
  2. Plan the minimal sequence of actions needed.
125
- 3. Execute each action one at a time, then verify by taking another screenshot.
125
+ 3. Execute actions, verifying with screenshots at key checkpoints (not after every single action).
126
126
  4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
127
127
 
128
+ ## Reading Screenshots
129
+ - Desktop screenshots include a **coordinate grid overlay**. The grid labels show pixel coordinates that directly correspond to \`mouse_click\` and \`mouse_move\` coordinates.
130
+ - Use the grid numbers to estimate the (x, y) position of UI elements. For example, if a button appears near the grid label "400" horizontally and "300" vertically, click at approximately (400, 300).
131
+ - The bottom-right corner label shows the total screen dimensions.
132
+
133
+ ## Tool Selection
134
+ - **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
135
+ - **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
136
+ - **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
137
+ - **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
138
+
139
+ ## Smart Screenshot Strategy
140
+ - ALWAYS take a screenshot before your first action.
141
+ - Take a screenshot to verify after **critical actions** (clicking a button, submitting a form, navigating to a new page).
142
+ - Skip verification screenshots for **low-risk sequential actions** (typing text, pressing modifier keys, scrolling) \u2014 verify after the sequence is complete instead.
143
+ - If an action might trigger a loading state, wait briefly then screenshot to confirm the page has loaded.
144
+
128
145
  ## Rules
129
- - ALWAYS take a screenshot before your first action to understand the current state.
130
- - After every mouse click or keyboard action, take a screenshot to verify the result.
131
146
  - Call ONE tool at a time. Never request multiple tools in parallel.
132
147
  - Before each tool call, briefly state what you are about to do and why.
133
148
  - After receiving a tool result, describe what you observed.
134
- - For browser tasks, prefer using browser_* tools over clicking on-screen coordinates.
135
- - For terminal tasks, prefer \`run_command\` over GUI interactions when possible.
136
149
  - Do not read or write files unless the instruction explicitly asks for it.
137
150
 
151
+ ## Handling Common Situations
152
+ - **Loading/transitions**: If a page or app is loading, take another screenshot after a moment instead of acting immediately.
153
+ - **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
154
+ - **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
155
+ - **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
156
+ - **Text input**: For browser forms, prefer \`browser_type\` with the CSS selector. For desktop apps, click the input field first, then use \`keyboard_type\`.
157
+ - **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
158
+
159
+ ## Error Recovery
160
+ - If an action fails or produces unexpected results, take a screenshot to reassess the situation before trying again.
161
+ - Try a different approach rather than repeating the same failed action. For example:
162
+ - If \`browser_click\` fails on a selector, try a different selector or fall back to coordinate-based \`mouse_click\`.
163
+ - If a UI element is not visible, try scrolling or switching tabs/windows.
164
+ - If something fails **twice with different approaches**, call \`report\` with status "blocked".
165
+
138
166
  ## report Tool
139
167
  Call \`report\` when:
140
168
  - **"completed"**: The task is done successfully. Summarize what was accomplished.
@@ -150,12 +178,9 @@ report({
150
178
  })
151
179
  \`\`\`
152
180
 
153
- Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
181
+ Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report. Include relevant screenshots in your report so the caller can see the final state.
154
182
 
155
- ## Important
156
- - Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
157
- - If a UI element is not where you expect it, try scrolling first before giving up.
158
- - Keep your responses concise. Focus on actions, not explanations.`;
183
+ You can also use \`use_local_image\` to load a local image file and get a screenshot ID for embedding in reports.`;
159
184
  }
160
185
  var init_system_prompt = __esm({
161
186
  "src/agent/system-prompt.ts"() {
@@ -776,7 +801,7 @@ var init_screenshot = __esm({
776
801
  const image = primary.captureImageSync();
777
802
  const physW = image.width;
778
803
  const physH = image.height;
779
- const scaleFactor = primary.scaleFactor ?? 1;
804
+ const scaleFactor = primary.scaleFactor() ?? 1;
780
805
  const logicalW = Math.round(physW / scaleFactor);
781
806
  const logicalH = Math.round(physH / scaleFactor);
782
807
  const raw = image.toRawSync();
@@ -1617,7 +1642,7 @@ init_types();
1617
1642
  import { program } from "commander";
1618
1643
  import { createInterface } from "readline";
1619
1644
  import { createServer } from "http";
1620
- import { mkdirSync as mkdirSync2, writeFileSync } from "fs";
1645
+ import { existsSync as existsSync4, mkdirSync as mkdirSync2, readFileSync as readFileSync3, writeFileSync } from "fs";
1621
1646
  import { join as join3 } from "path";
1622
1647
  import { tmpdir } from "os";
1623
1648
  function startScreenshotServer(screenshotDir) {
@@ -1654,7 +1679,35 @@ function startScreenshotServer(screenshotDir) {
1654
1679
  });
1655
1680
  }
1656
1681
  program.name("windows-use").description("Run Windows/browser automation tasks using a small LLM agent").version("0.2.0");
1657
- program.command("init").description("Interactive setup \u2014 save config to ~/.windows-use.json").action(async () => {
1682
+ program.command("init").description("Interactive setup, or import/export config via base64").argument("[base64]", "Import config from a base64 string").option("--export", "Export current config as a base64 string").action(async (base64Input, opts) => {
1683
+ const configPath = getConfigPath();
1684
+ if (opts.export) {
1685
+ if (!existsSync4(configPath)) {
1686
+ console.error("No config found. Run `windows-use init` first.");
1687
+ process.exit(1);
1688
+ }
1689
+ const raw = readFileSync3(configPath, "utf-8");
1690
+ const encoded = Buffer.from(raw).toString("base64");
1691
+ console.log(encoded);
1692
+ return;
1693
+ }
1694
+ if (base64Input) {
1695
+ try {
1696
+ const decoded = Buffer.from(base64Input, "base64").toString("utf-8");
1697
+ const parsed = JSON.parse(decoded);
1698
+ writeFileSync(configPath, JSON.stringify(parsed, null, 2) + "\n", "utf-8");
1699
+ console.log(`\u2705 Config imported to ${configPath}`);
1700
+ const display = { ...parsed };
1701
+ if (display.apiKey) {
1702
+ display.apiKey = display.apiKey.slice(0, 6) + "..." + display.apiKey.slice(-4);
1703
+ }
1704
+ console.log(JSON.stringify(display, null, 2));
1705
+ } catch {
1706
+ console.error("Invalid base64 or JSON. Make sure you copied the full string.");
1707
+ process.exit(1);
1708
+ }
1709
+ return;
1710
+ }
1658
1711
  const rl = createInterface({ input: process.stdin, output: process.stdout });
1659
1712
  const ask = (q) => new Promise((resolve) => rl.question(q, (a) => resolve(a.trim())));
1660
1713
  console.log("\n\u{1F527} windows-use setup\n");
@@ -1666,7 +1719,6 @@ program.command("init").description("Interactive setup \u2014 save config to ~/.
1666
1719
  if (baseURL) config.baseURL = baseURL;
1667
1720
  if (apiKey) config.apiKey = apiKey;
1668
1721
  if (model) config.model = model;
1669
- const configPath = getConfigPath();
1670
1722
  writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n", "utf-8");
1671
1723
  console.log(`
1672
1724
  \u2705 Config saved to ${configPath}`);