npm - windows-use - Versions diffs - 0.3.1 → 0.3.2 - Mend

windows-use 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/index.d.ts CHANGED

@@ -107,6 +107,7 @@ type ToolResult = {
     base64: string;
     mimeType: 'image/png' | 'image/jpeg';
     screenshotId: string;
+    content?: string;
 } | {
     type: 'report';
     status: 'completed' | 'blocked' | 'need_guidance';

package/dist/index.js CHANGED

@@ -97,15 +97,18 @@ function buildSystemPrompt() {
 4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
 ## Reading Screenshots
-- Desktop screenshots include a **coordinate grid overlay**. The grid labels show pixel coordinates that directly correspond to \`mouse_click\` and \`mouse_move\` coordinates.
-- Use the grid numbers to estimate the (x, y) position of UI elements. For example, if a button appears near the grid label "400" horizontally and "300" vertically, click at approximately (400, 300).
-- The bottom-right corner label shows the total screen dimensions.
+- Screenshots include a **coordinate grid overlay** with **numbered blue reference markers** at grid intersections.
+- Each screenshot also returns a **text coordinate table** mapping marker numbers to exact screen coordinates, e.g. \`[1](200,200) [2](400,200)\`.
+- **How to locate elements precisely**: Find the nearest blue numbered marker to your target in the image, look up its exact (x,y) from the coordinate table, then adjust for the offset.
+- Example: A button is just right of marker \`[7]\`. The table says \`[7](600,400)\`. The button is ~50px right \u2192 click at (650, 400).
+- The red edge labels and bottom-right dimension label are also available for reference.
 ## Tool Selection
 - **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
 - **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
 - **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
 - **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
+- **Window management**: Use \`list_windows\` to see all open windows, \`focus_window\` to activate a specific window, and \`window_screenshot\` to capture a specific window (coordinates in the grid are screen-absolute, matching \`mouse_click\`). Focus a window before sending keyboard/mouse input to it.
 ## Smart Screenshot Strategy
 - ALWAYS take a screenshot before your first action.
@@ -124,7 +127,12 @@ function buildSystemPrompt() {
 - **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
 - **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
 - **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
-- **Text input**: For browser forms, prefer \`browser_type\` with the CSS selector. For desktop apps, click the input field first, then use \`keyboard_type\`.
+- **Text input**:
+  - For browser forms, prefer \`browser_type\` with the CSS selector.
+  - For desktop apps, click the input field first, then type.
+  - Use \`clipboard_type\` (paste via clipboard) when: the text contains non-ASCII characters (Chinese, Japanese, etc.), the current IME might interfere, or you need fast input.
+  - Use \`keyboard_type\` (character-by-character) when: you need to trigger per-key events, or for simple ASCII text with English IME active.
+  - If \`keyboard_type\` produces garbled text, switch to \`clipboard_type\` or use \`switch_input_method\` to toggle the IME first.
 - **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
 ## Error Recovery
@@ -300,11 +308,13 @@ var AgentRunner = class {
         }
         if (result.type === "image") {
           this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
+          const textPart = result.content ? `Screenshot captured. ID: ${result.screenshotId}
+${result.content}` : `Screenshot captured. ID: ${result.screenshotId}`;
           this.contextManager.append({
             role: "tool",
             tool_call_id: toolCall.id,
             content: [
-              { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
+              { type: "text", text: textPart },
               {
                 type: "image_url",
                 image_url: {
@@ -670,6 +680,8 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
   const gridSpacing = options.gridSpacing ?? 100;
   const labelSpacing = options.labelSpacing ?? 200;
   const majorSpacing = gridSpacing * 5;
+  const offsetX = options.offsetX ?? 0;
+  const offsetY = options.offsetY ?? 0;
   const svgParts = [];
   for (let x = gridSpacing; x < width; x += gridSpacing) {
     const isMajor = x % majorSpacing === 0;
@@ -687,8 +699,24 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
       `<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
     );
   }
+  const markers = [];
+  let markerId = 1;
+  for (let y = labelSpacing; y < height; y += labelSpacing) {
+    for (let x = labelSpacing; x < width; x += labelSpacing) {
+      const screenX = x + offsetX;
+      const screenY = y + offsetY;
+      markers.push({ id: markerId, screenX, screenY });
+      const label = String(markerId);
+      const r = label.length > 1 ? 12 : 10;
+      svgParts.push(
+        `<circle cx="${x}" cy="${y}" r="${r}" fill="rgba(0,110,255,0.85)" stroke="white" stroke-width="1"/>`,
+        `<text x="${x}" y="${y + 4}" text-anchor="middle" fill="white" font-size="${label.length > 1 ? 9 : 10}" font-family="Consolas,monospace" font-weight="bold">${label}</text>`
+      );
+      markerId++;
+    }
+  }
   for (let x = labelSpacing; x < width; x += labelSpacing) {
-    const text = String(x);
+    const text = String(x + offsetX);
     const tw = text.length * 7.5 + 6;
     svgParts.push(
       `<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
@@ -696,16 +724,18 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
     );
   }
   for (let y = labelSpacing; y < height; y += labelSpacing) {
-    const text = String(y);
+    const text = String(y + offsetY);
     const tw = text.length * 7.5 + 6;
     svgParts.push(
       `<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
       `<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
     );
   }
+  const originText = `${offsetX},${offsetY}`;
+  const originTw = originText.length * 7.5 + 6;
   svgParts.push(
-    `<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
-    `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
+    `<rect x="2" y="2" width="${originTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
+    `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${originText}</text>`
   );
   const dimText = `${width}x${height}`;
   const dimTw = dimText.length * 7.5 + 6;
@@ -716,7 +746,16 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
   const svg = Buffer.from(
     `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
   );
-  return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
+  const image = await sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
+  const cols = Math.floor((width - 1) / labelSpacing);
+  const rows = [];
+  for (let i = 0; i < markers.length; i += cols) {
+    const row = markers.slice(i, i + cols).map((m) => `[${m.id}](${m.screenX},${m.screenY})`).join(" ");
+    rows.push(row);
+  }
+  const gridRef = `Grid reference points (marker \u2192 screen coordinates for mouse_click):
+${rows.join("\n")}`;
+  return { image, gridRef };
 }
 // src/tools/windows/screenshot.ts
@@ -743,13 +782,14 @@ var screenshotTool = {
     }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
     const cleanBase64 = resized.toString("base64");
     const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
-    const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
+    const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH);
     const gridBase64 = gridImage.toString("base64");
     return {
       type: "image",
       base64: gridBase64,
       mimeType: "image/jpeg",
-      screenshotId: id
+      screenshotId: id,
+      content: gridRef
     };
   }
 };
@@ -895,16 +935,179 @@ var keyboardPressTool = {
   }
 };
-// src/tools/windows/command.ts
+// src/tools/windows/clipboard.ts
 import { z as z5 } from "zod";
+async function getNutJs3() {
+  return import("@nut-tree-fork/nut-js");
+}
+var clipboardTypeTool = {
+  name: "clipboard_type",
+  description: "Type text by copying it to the clipboard and pasting (Ctrl+V). This bypasses input method (IME) issues and is faster than keyboard_type. Use this when the current IME might interfere, or for non-ASCII text (Chinese, Japanese, etc.).",
+  parameters: z5.object({
+    text: z5.string().describe("The text to paste")
+  }),
+  async execute(args) {
+    const nut = await getNutJs3();
+    await nut.clipboard.setContent(args.text);
+    await nut.keyboard.pressKey(nut.Key.LeftControl, nut.Key.V);
+    await nut.keyboard.releaseKey(nut.Key.LeftControl, nut.Key.V);
+    return { type: "text", content: `Pasted: "${args.text}"` };
+  }
+};
+var switchInputMethodTool = {
+  name: "switch_input_method",
+  description: "Toggle the input method (IME) by pressing Win+Space. Use this before keyboard_type if the current IME is wrong. Take a screenshot afterward to verify the switch.",
+  parameters: z5.object({}),
+  async execute() {
+    const nut = await getNutJs3();
+    await nut.keyboard.pressKey(nut.Key.LeftWin, nut.Key.Space);
+    await nut.keyboard.releaseKey(nut.Key.LeftWin, nut.Key.Space);
+    return { type: "text", content: "Toggled input method (Win+Space)" };
+  }
+};
+// src/tools/windows/window.ts
+import { z as z6 } from "zod";
+import sharp3 from "sharp";
+async function getNutJs4() {
+  return import("@nut-tree-fork/nut-js");
+}
+async function getNodeScreenshots() {
+  return import("node-screenshots");
+}
+function findWindowByTitle(windows, title) {
+  const lower = title.toLowerCase();
+  const exact = windows.find(
+    (w) => w.title().toLowerCase() === lower
+  );
+  if (exact) return exact;
+  return windows.find(
+    (w) => w.title().toLowerCase().includes(lower)
+  );
+}
+var listWindowsTool = {
+  name: "list_windows",
+  description: "List all visible windows with their titles, positions, and sizes.",
+  parameters: z6.object({}),
+  async execute() {
+    const { Window } = await getNodeScreenshots();
+    const windows = Window.all();
+    const list = windows.filter((w) => w.title().trim().length > 0).map((w) => ({
+      id: w.id(),
+      title: w.title(),
+      appName: w.appName(),
+      x: w.x(),
+      y: w.y(),
+      width: w.width(),
+      height: w.height(),
+      isMinimized: w.isMinimized(),
+      isFocused: w.isFocused()
+    }));
+    const formatted = list.map(
+      (w) => `[${w.isFocused ? "*" : " "}] "${w.title}" (${w.appName}) \u2014 pos:(${w.x},${w.y}) size:${w.width}x${w.height}${w.isMinimized ? " [minimized]" : ""}`
+    ).join("\n");
+    return {
+      type: "text",
+      content: `Found ${list.length} windows:
+${formatted}`
+    };
+  }
+};
+var focusWindowTool = {
+  name: "focus_window",
+  description: "Focus (activate) a window by its title. Uses partial, case-insensitive matching.",
+  parameters: z6.object({
+    title: z6.string().describe("Window title to search for (partial match)")
+  }),
+  async execute(args) {
+    const nut = await getNutJs4();
+    const windows = await nut.getWindows();
+    const lower = args.title.toLowerCase();
+    let target = null;
+    for (const w of windows) {
+      const t = await w.title;
+      if (t.toLowerCase() === lower) {
+        target = w;
+        break;
+      }
+      if (!target && t.toLowerCase().includes(lower)) {
+        target = w;
+      }
+    }
+    if (!target) {
+      return {
+        type: "text",
+        content: `Error: No window found matching "${args.title}"`
+      };
+    }
+    const title = await target.title;
+    await target.focus();
+    return { type: "text", content: `Focused window: "${title}"` };
+  }
+};
+var windowScreenshotTool = {
+  name: "window_screenshot",
+  description: "Capture a screenshot of a specific window by its title. The coordinate grid shows screen-absolute coordinates (matching mouse_click). Returns a screenshot ID.",
+  parameters: z6.object({
+    title: z6.string().describe("Window title to search for (partial match)")
+  }),
+  async execute(args, ctx) {
+    const { Window, Monitor } = await getNodeScreenshots();
+    const windows = Window.all().filter(
+      (w) => w.title().trim().length > 0
+    );
+    const target = findWindowByTitle(windows, args.title);
+    if (!target) {
+      return {
+        type: "text",
+        content: `Error: No window found matching "${args.title}"`
+      };
+    }
+    const winTitle = target.title();
+    const winX = target.x();
+    const winY = target.y();
+    const image = target.captureImageSync();
+    const physW = image.width;
+    const physH = image.height;
+    const monitor = target.currentMonitor();
+    const scaleFactor = monitor ? monitor.scaleFactor() : 1;
+    const logicalW = Math.round(physW / scaleFactor);
+    const logicalH = Math.round(physH / scaleFactor);
+    const raw = image.toRawSync();
+    const resized = await sharp3(raw, {
+      raw: { width: physW, height: physH, channels: 4 }
+    }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
+    const cleanBase64 = resized.toString("base64");
+    const id = ctx.screenshots.save(
+      cleanBase64,
+      "image/jpeg",
+      `window: ${winTitle}`
+    );
+    const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH, {
+      offsetX: winX,
+      offsetY: winY
+    });
+    const gridBase64 = gridImage.toString("base64");
+    return {
+      type: "image",
+      base64: gridBase64,
+      mimeType: "image/jpeg",
+      screenshotId: id,
+      content: gridRef
+    };
+  }
+};
+// src/tools/windows/command.ts
+import { z as z7 } from "zod";
 import { exec } from "child_process";
 var MAX_OUTPUT_LENGTH = 1e4;
 var runCommandTool = {
   name: "run_command",
   description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
-  parameters: z5.object({
-    command: z5.string().describe("The command to execute"),
-    timeout: z5.number().positive().default(3e4).describe("Timeout in milliseconds")
+  parameters: z7.object({
+    command: z7.string().describe("The command to execute"),
+    timeout: z7.number().positive().default(3e4).describe("Timeout in milliseconds")
   }),
   async execute(args) {
     return new Promise((resolve) => {
@@ -935,14 +1138,14 @@ var runCommandTool = {
 };
 // src/tools/file/read.ts
-import { z as z6 } from "zod";
+import { z as z8 } from "zod";
 import { readFile } from "fs/promises";
 var MAX_FILE_SIZE = 1e5;
 var fileReadTool = {
   name: "file_read",
   description: "Read the contents of a file at the given path.",
-  parameters: z6.object({
-    path: z6.string().describe("Absolute path to the file")
+  parameters: z8.object({
+    path: z8.string().describe("Absolute path to the file")
   }),
   async execute(args) {
     try {
@@ -962,15 +1165,15 @@ var fileReadTool = {
 };
 // src/tools/file/write.ts
-import { z as z7 } from "zod";
+import { z as z9 } from "zod";
 import { writeFile, mkdir } from "fs/promises";
 import { dirname } from "path";
 var fileWriteTool = {
   name: "file_write",
   description: "Write content to a file at the given path. Creates parent directories if needed.",
-  parameters: z7.object({
-    path: z7.string().describe("Absolute path to the file"),
-    content: z7.string().describe("Content to write")
+  parameters: z9.object({
+    path: z9.string().describe("Absolute path to the file"),
+    content: z9.string().describe("Content to write")
   }),
   async execute(args) {
     try {
@@ -985,16 +1188,16 @@ var fileWriteTool = {
 };
 // src/tools/file/image.ts
-import { z as z8 } from "zod";
+import { z as z10 } from "zod";
 import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
 import { extname } from "path";
 var IMAGE_EXTS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".bmp", ".webp"]);
 var useLocalImageTool = {
   name: "use_local_image",
   description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
-  parameters: z8.object({
-    path: z8.string().describe("Absolute path to the image file"),
-    label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
+  parameters: z10.object({
+    path: z10.string().describe("Absolute path to the image file"),
+    label: z10.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
   }),
   async execute(args, ctx) {
     if (!existsSync3(args.path)) {
@@ -1018,12 +1221,12 @@ var useLocalImageTool = {
 };
 // src/tools/browser/navigate.ts
-import { z as z9 } from "zod";
+import { z as z11 } from "zod";
 var browserNavigateTool = {
   name: "browser_navigate",
   description: "Navigate the browser to a URL.",
-  parameters: z9.object({
-    url: z9.string().describe("The URL to navigate to")
+  parameters: z11.object({
+    url: z11.string().describe("The URL to navigate to")
   }),
   async execute(args, ctx) {
     const browser = await ctx.getBrowser();
@@ -1036,12 +1239,12 @@ Page title: ${title}` };
 };
 // src/tools/browser/click.ts
-import { z as z10 } from "zod";
+import { z as z12 } from "zod";
 var browserClickTool = {
   name: "browser_click",
   description: "Click an element on the web page using a CSS selector or text content.",
-  parameters: z10.object({
-    selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
+  parameters: z12.object({
+    selector: z12.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
   }),
   async execute(args, ctx) {
     const browser = await ctx.getBrowser();
@@ -1052,14 +1255,14 @@ var browserClickTool = {
 };
 // src/tools/browser/type.ts
-import { z as z11 } from "zod";
+import { z as z13 } from "zod";
 var browserTypeTool = {
   name: "browser_type",
   description: "Type text into an input field on the web page.",
-  parameters: z11.object({
-    selector: z11.string().describe("CSS selector for the input element"),
-    text: z11.string().describe("Text to type"),
-    clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
+  parameters: z13.object({
+    selector: z13.string().describe("CSS selector for the input element"),
+    text: z13.string().describe("Text to type"),
+    clear: z13.boolean().default(true).describe("Whether to clear the field before typing")
   }),
   async execute(args, ctx) {
     const browser = await ctx.getBrowser();
@@ -1074,12 +1277,12 @@ var browserTypeTool = {
 };
 // src/tools/browser/screenshot.ts
-import { z as z12 } from "zod";
+import { z as z14 } from "zod";
 var browserScreenshotTool = {
   name: "browser_screenshot",
   description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
-  parameters: z12.object({
-    fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
+  parameters: z14.object({
+    fullPage: z14.boolean().default(false).describe("Whether to capture the full scrollable page")
   }),
   async execute(args, ctx) {
     const browser = await ctx.getBrowser();
@@ -1102,12 +1305,12 @@ var browserScreenshotTool = {
 };
 // src/tools/browser/content.ts
-import { z as z13 } from "zod";
+import { z as z15 } from "zod";
 var MAX_CONTENT_LENGTH = 2e4;
 var browserContentTool = {
   name: "browser_content",
   description: "Get the text content of the current web page. Returns visible text, not HTML.",
-  parameters: z13.object({}),
+  parameters: z15.object({}),
   async execute(_args, ctx) {
     const browser = await ctx.getBrowser();
     const page = await browser.getPage();
@@ -1128,13 +1331,13 @@ ${text}`
 };
 // src/tools/browser/scroll.ts
-import { z as z14 } from "zod";
+import { z as z16 } from "zod";
 var browserScrollTool = {
   name: "browser_scroll",
   description: "Scroll the current web page.",
-  parameters: z14.object({
-    direction: z14.enum(["up", "down"]).describe("Scroll direction"),
-    amount: z14.number().positive().default(500).describe("Pixels to scroll")
+  parameters: z16.object({
+    direction: z16.enum(["up", "down"]).describe("Scroll direction"),
+    amount: z16.number().positive().default(500).describe("Pixels to scroll")
   }),
   async execute(args, ctx) {
     const browser = await ctx.getBrowser();
@@ -1146,16 +1349,16 @@ var browserScrollTool = {
 };
 // src/tools/control/report.ts
-import { z as z15 } from "zod";
+import { z as z17 } from "zod";
 var reportTool = {
   name: "report",
   description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
-  parameters: z15.object({
-    status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
+  parameters: z17.object({
+    status: z17.enum(["completed", "blocked", "need_guidance"]).describe(
       '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
     ),
-    content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
-    data: z15.unknown().optional().describe("Optional structured data to return")
+    content: z17.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
+    data: z17.unknown().optional().describe("Optional structured data to return")
   }),
   async execute(args) {
     return {
@@ -1176,6 +1379,11 @@ function createToolRegistry() {
   registry.register(mouseScrollTool);
   registry.register(keyboardTypeTool);
   registry.register(keyboardPressTool);
+  registry.register(clipboardTypeTool);
+  registry.register(switchInputMethodTool);
+  registry.register(listWindowsTool);
+  registry.register(focusWindowTool);
+  registry.register(windowScreenshotTool);
   registry.register(runCommandTool);
   registry.register(fileReadTool);
   registry.register(fileWriteTool);