npm - windows-use - Versions diffs - 0.3.1 → 0.3.2 - Mend

windows-use 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/cli.js (changed)

@@ -126,15 +126,18 @@ function buildSystemPrompt() {
 4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
 ## Reading Screenshots
-- Desktop screenshots include a **coordinate grid overlay**. The grid labels show pixel coordinates that directly correspond to \`mouse_click\` and \`mouse_move\` coordinates.
-- Use the grid numbers to estimate the (x, y) position of UI elements. For example, if a button appears near the grid label "400" horizontally and "300" vertically, click at approximately (400, 300).
-- The bottom-right corner label shows the total screen dimensions.
+- Screenshots include a **coordinate grid overlay** with **numbered blue reference markers** at grid intersections.
+- Each screenshot also returns a **text coordinate table** mapping marker numbers to exact screen coordinates, e.g. \`[1](200,200) [2](400,200)\`.
+- **How to locate elements precisely**: Find the nearest blue numbered marker to your target in the image, look up its exact (x,y) from the coordinate table, then adjust for the offset.
+- Example: A button is just right of marker \`[7]\`. The table says \`[7](600,400)\`. The button is ~50px right \u2192 click at (650, 400).
+- The red edge labels and bottom-right dimension label are also available for reference.
 ## Tool Selection
 - **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
 - **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
 - **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
 - **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
+- **Window management**: Use \`list_windows\` to see all open windows, \`focus_window\` to activate a specific window, and \`window_screenshot\` to capture a specific window (coordinates in the grid are screen-absolute, matching \`mouse_click\`). Focus a window before sending keyboard/mouse input to it.
 ## Smart Screenshot Strategy
 - ALWAYS take a screenshot before your first action.
@@ -153,7 +156,12 @@ function buildSystemPrompt() {
 - **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
 - **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
 - **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
-- **Text input**: For browser forms, prefer \`browser_type\` with the CSS selector. For desktop apps, click the input field first, then use \`keyboard_type\`.
+- **Text input**:
+  - For browser forms, prefer \`browser_type\` with the CSS selector.
+  - For desktop apps, click the input field first, then type.
+  - Use \`clipboard_type\` (paste via clipboard) when: the text contains non-ASCII characters (Chinese, Japanese, etc.), the current IME might interfere, or you need fast input.
+  - Use \`keyboard_type\` (character-by-character) when: you need to trigger per-key events, or for simple ASCII text with English IME active.
+  - If \`keyboard_type\` produces garbled text, switch to \`clipboard_type\` or use \`switch_input_method\` to toggle the IME first.
 - **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
 ## Error Recovery
@@ -339,11 +347,13 @@ var init_runner = __esm({
             }
             if (result.type === "image") {
               this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
+              const textPart = result.content ? `Screenshot captured. ID: ${result.screenshotId}
+${result.content}` : `Screenshot captured. ID: ${result.screenshotId}`;
               this.contextManager.append({
                 role: "tool",
                 tool_call_id: toolCall.id,
                 content: [
-                  { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
+                  { type: "text", text: textPart },
                   {
                     type: "image_url",
                     image_url: {
@@ -725,6 +735,8 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
   const gridSpacing = options.gridSpacing ?? 100;
   const labelSpacing = options.labelSpacing ?? 200;
   const majorSpacing = gridSpacing * 5;
+  const offsetX = options.offsetX ?? 0;
+  const offsetY = options.offsetY ?? 0;
   const svgParts = [];
   for (let x = gridSpacing; x < width; x += gridSpacing) {
     const isMajor = x % majorSpacing === 0;
@@ -742,8 +754,24 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
       `<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
     );
   }
+  const markers = [];
+  let markerId = 1;
+  for (let y = labelSpacing; y < height; y += labelSpacing) {
+    for (let x = labelSpacing; x < width; x += labelSpacing) {
+      const screenX = x + offsetX;
+      const screenY = y + offsetY;
+      markers.push({ id: markerId, screenX, screenY });
+      const label = String(markerId);
+      const r = label.length > 1 ? 12 : 10;
+      svgParts.push(
+        `<circle cx="${x}" cy="${y}" r="${r}" fill="rgba(0,110,255,0.85)" stroke="white" stroke-width="1"/>`,
+        `<text x="${x}" y="${y + 4}" text-anchor="middle" fill="white" font-size="${label.length > 1 ? 9 : 10}" font-family="Consolas,monospace" font-weight="bold">${label}</text>`
+      );
+      markerId++;
+    }
+  }
   for (let x = labelSpacing; x < width; x += labelSpacing) {
-    const text = String(x);
+    const text = String(x + offsetX);
     const tw = text.length * 7.5 + 6;
     svgParts.push(
       `<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
@@ -751,16 +779,18 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
     );
   }
   for (let y = labelSpacing; y < height; y += labelSpacing) {
-    const text = String(y);
+    const text = String(y + offsetY);
     const tw = text.length * 7.5 + 6;
     svgParts.push(
       `<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
       `<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
     );
   }
+  const originText = `${offsetX},${offsetY}`;
+  const originTw = originText.length * 7.5 + 6;
   svgParts.push(
-    `<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
-    `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
+    `<rect x="2" y="2" width="${originTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
+    `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${originText}</text>`
   );
   const dimText = `${width}x${height}`;
   const dimTw = dimText.length * 7.5 + 6;
@@ -771,7 +801,16 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
   const svg = Buffer.from(
     `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
   );
-  return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
+  const image = await sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
+  const cols = Math.floor((width - 1) / labelSpacing);
+  const rows = [];
+  for (let i = 0; i < markers.length; i += cols) {
+    const row = markers.slice(i, i + cols).map((m) => `[${m.id}](${m.screenX},${m.screenY})`).join(" ");
+    rows.push(row);
+  }
+  const gridRef = `Grid reference points (marker \u2192 screen coordinates for mouse_click):
+${rows.join("\n")}`;
+  return { image, gridRef };
 }
 var init_grid_overlay = __esm({
   "src/tools/windows/grid-overlay.ts"() {
@@ -810,13 +849,14 @@ var init_screenshot = __esm({
         }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
         const cleanBase64 = resized.toString("base64");
         const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
-        const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
+        const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH);
         const gridBase64 = gridImage.toString("base64");
         return {
           type: "image",
           base64: gridBase64,
           mimeType: "image/jpeg",
-          screenshotId: id
+          screenshotId: id,
+          content: gridRef
         };
       }
     };
@@ -976,8 +1016,184 @@ var init_keyboard = __esm({
   }
 });
-// src/tools/windows/command.ts
+// src/tools/windows/clipboard.ts
 import { z as z5 } from "zod";
+async function getNutJs3() {
+  return import("@nut-tree-fork/nut-js");
+}
+var clipboardTypeTool, switchInputMethodTool;
+var init_clipboard = __esm({
+  "src/tools/windows/clipboard.ts"() {
+    "use strict";
+    clipboardTypeTool = {
+      name: "clipboard_type",
+      description: "Type text by copying it to the clipboard and pasting (Ctrl+V). This bypasses input method (IME) issues and is faster than keyboard_type. Use this when the current IME might interfere, or for non-ASCII text (Chinese, Japanese, etc.).",
+      parameters: z5.object({
+        text: z5.string().describe("The text to paste")
+      }),
+      async execute(args) {
+        const nut = await getNutJs3();
+        await nut.clipboard.setContent(args.text);
+        await nut.keyboard.pressKey(nut.Key.LeftControl, nut.Key.V);
+        await nut.keyboard.releaseKey(nut.Key.LeftControl, nut.Key.V);
+        return { type: "text", content: `Pasted: "${args.text}"` };
+      }
+    };
+    switchInputMethodTool = {
+      name: "switch_input_method",
+      description: "Toggle the input method (IME) by pressing Win+Space. Use this before keyboard_type if the current IME is wrong. Take a screenshot afterward to verify the switch.",
+      parameters: z5.object({}),
+      async execute() {
+        const nut = await getNutJs3();
+        await nut.keyboard.pressKey(nut.Key.LeftWin, nut.Key.Space);
+        await nut.keyboard.releaseKey(nut.Key.LeftWin, nut.Key.Space);
+        return { type: "text", content: "Toggled input method (Win+Space)" };
+      }
+    };
+  }
+});
+// src/tools/windows/window.ts
+import { z as z6 } from "zod";
+import sharp3 from "sharp";
+async function getNutJs4() {
+  return import("@nut-tree-fork/nut-js");
+}
+async function getNodeScreenshots() {
+  return import("node-screenshots");
+}
+function findWindowByTitle(windows, title) {
+  const lower = title.toLowerCase();
+  const exact = windows.find(
+    (w) => w.title().toLowerCase() === lower
+  );
+  if (exact) return exact;
+  return windows.find(
+    (w) => w.title().toLowerCase().includes(lower)
+  );
+}
+var listWindowsTool, focusWindowTool, windowScreenshotTool;
+var init_window = __esm({
+  "src/tools/windows/window.ts"() {
+    "use strict";
+    init_grid_overlay();
+    listWindowsTool = {
+      name: "list_windows",
+      description: "List all visible windows with their titles, positions, and sizes.",
+      parameters: z6.object({}),
+      async execute() {
+        const { Window } = await getNodeScreenshots();
+        const windows = Window.all();
+        const list = windows.filter((w) => w.title().trim().length > 0).map((w) => ({
+          id: w.id(),
+          title: w.title(),
+          appName: w.appName(),
+          x: w.x(),
+          y: w.y(),
+          width: w.width(),
+          height: w.height(),
+          isMinimized: w.isMinimized(),
+          isFocused: w.isFocused()
+        }));
+        const formatted = list.map(
+          (w) => `[${w.isFocused ? "*" : " "}] "${w.title}" (${w.appName}) \u2014 pos:(${w.x},${w.y}) size:${w.width}x${w.height}${w.isMinimized ? " [minimized]" : ""}`
+        ).join("\n");
+        return {
+          type: "text",
+          content: `Found ${list.length} windows:
+${formatted}`
+        };
+      }
+    };
+    focusWindowTool = {
+      name: "focus_window",
+      description: "Focus (activate) a window by its title. Uses partial, case-insensitive matching.",
+      parameters: z6.object({
+        title: z6.string().describe("Window title to search for (partial match)")
+      }),
+      async execute(args) {
+        const nut = await getNutJs4();
+        const windows = await nut.getWindows();
+        const lower = args.title.toLowerCase();
+        let target = null;
+        for (const w of windows) {
+          const t = await w.title;
+          if (t.toLowerCase() === lower) {
+            target = w;
+            break;
+          }
+          if (!target && t.toLowerCase().includes(lower)) {
+            target = w;
+          }
+        }
+        if (!target) {
+          return {
+            type: "text",
+            content: `Error: No window found matching "${args.title}"`
+          };
+        }
+        const title = await target.title;
+        await target.focus();
+        return { type: "text", content: `Focused window: "${title}"` };
+      }
+    };
+    windowScreenshotTool = {
+      name: "window_screenshot",
+      description: "Capture a screenshot of a specific window by its title. The coordinate grid shows screen-absolute coordinates (matching mouse_click). Returns a screenshot ID.",
+      parameters: z6.object({
+        title: z6.string().describe("Window title to search for (partial match)")
+      }),
+      async execute(args, ctx) {
+        const { Window, Monitor } = await getNodeScreenshots();
+        const windows = Window.all().filter(
+          (w) => w.title().trim().length > 0
+        );
+        const target = findWindowByTitle(windows, args.title);
+        if (!target) {
+          return {
+            type: "text",
+            content: `Error: No window found matching "${args.title}"`
+          };
+        }
+        const winTitle = target.title();
+        const winX = target.x();
+        const winY = target.y();
+        const image = target.captureImageSync();
+        const physW = image.width;
+        const physH = image.height;
+        const monitor = target.currentMonitor();
+        const scaleFactor = monitor ? monitor.scaleFactor() : 1;
+        const logicalW = Math.round(physW / scaleFactor);
+        const logicalH = Math.round(physH / scaleFactor);
+        const raw = image.toRawSync();
+        const resized = await sharp3(raw, {
+          raw: { width: physW, height: physH, channels: 4 }
+        }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
+        const cleanBase64 = resized.toString("base64");
+        const id = ctx.screenshots.save(
+          cleanBase64,
+          "image/jpeg",
+          `window: ${winTitle}`
+        );
+        const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH, {
+          offsetX: winX,
+          offsetY: winY
+        });
+        const gridBase64 = gridImage.toString("base64");
+        return {
+          type: "image",
+          base64: gridBase64,
+          mimeType: "image/jpeg",
+          screenshotId: id,
+          content: gridRef
+        };
+      }
+    };
+  }
+});
+// src/tools/windows/command.ts
+import { z as z7 } from "zod";
 import { exec } from "child_process";
 var MAX_OUTPUT_LENGTH, runCommandTool;
 var init_command = __esm({
@@ -987,9 +1203,9 @@ var init_command = __esm({
     runCommandTool = {
       name: "run_command",
       description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
-      parameters: z5.object({
-        command: z5.string().describe("The command to execute"),
-        timeout: z5.number().positive().default(3e4).describe("Timeout in milliseconds")
+      parameters: z7.object({
+        command: z7.string().describe("The command to execute"),
+        timeout: z7.number().positive().default(3e4).describe("Timeout in milliseconds")
       }),
       async execute(args) {
         return new Promise((resolve) => {
@@ -1022,7 +1238,7 @@ var init_command = __esm({
 });
 // src/tools/file/read.ts
-import { z as z6 } from "zod";
+import { z as z8 } from "zod";
 import { readFile } from "fs/promises";
 var MAX_FILE_SIZE, fileReadTool;
 var init_read = __esm({
@@ -1032,8 +1248,8 @@ var init_read = __esm({
     fileReadTool = {
       name: "file_read",
       description: "Read the contents of a file at the given path.",
-      parameters: z6.object({
-        path: z6.string().describe("Absolute path to the file")
+      parameters: z8.object({
+        path: z8.string().describe("Absolute path to the file")
       }),
       async execute(args) {
         try {
@@ -1055,7 +1271,7 @@ var init_read = __esm({
 });
 // src/tools/file/write.ts
-import { z as z7 } from "zod";
+import { z as z9 } from "zod";
 import { writeFile, mkdir } from "fs/promises";
 import { dirname } from "path";
 var fileWriteTool;
@@ -1065,9 +1281,9 @@ var init_write = __esm({
     fileWriteTool = {
       name: "file_write",
       description: "Write content to a file at the given path. Creates parent directories if needed.",
-      parameters: z7.object({
-        path: z7.string().describe("Absolute path to the file"),
-        content: z7.string().describe("Content to write")
+      parameters: z9.object({
+        path: z9.string().describe("Absolute path to the file"),
+        content: z9.string().describe("Content to write")
       }),
       async execute(args) {
         try {
@@ -1084,7 +1300,7 @@ var init_write = __esm({
 });
 // src/tools/file/image.ts
-import { z as z8 } from "zod";
+import { z as z10 } from "zod";
 import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
 import { extname } from "path";
 var IMAGE_EXTS, useLocalImageTool;
@@ -1095,9 +1311,9 @@ var init_image = __esm({
     useLocalImageTool = {
       name: "use_local_image",
       description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
-      parameters: z8.object({
-        path: z8.string().describe("Absolute path to the image file"),
-        label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
+      parameters: z10.object({
+        path: z10.string().describe("Absolute path to the image file"),
+        label: z10.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
       }),
       async execute(args, ctx) {
         if (!existsSync3(args.path)) {
@@ -1123,7 +1339,7 @@ var init_image = __esm({
 });
 // src/tools/browser/navigate.ts
-import { z as z9 } from "zod";
+import { z as z11 } from "zod";
 var browserNavigateTool;
 var init_navigate = __esm({
   "src/tools/browser/navigate.ts"() {
@@ -1131,8 +1347,8 @@ var init_navigate = __esm({
     browserNavigateTool = {
       name: "browser_navigate",
       description: "Navigate the browser to a URL.",
-      parameters: z9.object({
-        url: z9.string().describe("The URL to navigate to")
+      parameters: z11.object({
+        url: z11.string().describe("The URL to navigate to")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -1147,7 +1363,7 @@ Page title: ${title}` };
 });
 // src/tools/browser/click.ts
-import { z as z10 } from "zod";
+import { z as z12 } from "zod";
 var browserClickTool;
 var init_click = __esm({
   "src/tools/browser/click.ts"() {
@@ -1155,8 +1371,8 @@ var init_click = __esm({
     browserClickTool = {
       name: "browser_click",
       description: "Click an element on the web page using a CSS selector or text content.",
-      parameters: z10.object({
-        selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
+      parameters: z12.object({
+        selector: z12.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -1169,7 +1385,7 @@ var init_click = __esm({
 });
 // src/tools/browser/type.ts
-import { z as z11 } from "zod";
+import { z as z13 } from "zod";
 var browserTypeTool;
 var init_type = __esm({
   "src/tools/browser/type.ts"() {
@@ -1177,10 +1393,10 @@ var init_type = __esm({
     browserTypeTool = {
       name: "browser_type",
       description: "Type text into an input field on the web page.",
-      parameters: z11.object({
-        selector: z11.string().describe("CSS selector for the input element"),
-        text: z11.string().describe("Text to type"),
-        clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
+      parameters: z13.object({
+        selector: z13.string().describe("CSS selector for the input element"),
+        text: z13.string().describe("Text to type"),
+        clear: z13.boolean().default(true).describe("Whether to clear the field before typing")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -1197,7 +1413,7 @@ var init_type = __esm({
 });
 // src/tools/browser/screenshot.ts
-import { z as z12 } from "zod";
+import { z as z14 } from "zod";
 var browserScreenshotTool;
 var init_screenshot2 = __esm({
   "src/tools/browser/screenshot.ts"() {
@@ -1205,8 +1421,8 @@ var init_screenshot2 = __esm({
     browserScreenshotTool = {
       name: "browser_screenshot",
       description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
-      parameters: z12.object({
-        fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
+      parameters: z14.object({
+        fullPage: z14.boolean().default(false).describe("Whether to capture the full scrollable page")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -1231,7 +1447,7 @@ var init_screenshot2 = __esm({
 });
 // src/tools/browser/content.ts
-import { z as z13 } from "zod";
+import { z as z15 } from "zod";
 var MAX_CONTENT_LENGTH, browserContentTool;
 var init_content = __esm({
   "src/tools/browser/content.ts"() {
@@ -1240,7 +1456,7 @@ var init_content = __esm({
     browserContentTool = {
       name: "browser_content",
       description: "Get the text content of the current web page. Returns visible text, not HTML.",
-      parameters: z13.object({}),
+      parameters: z15.object({}),
       async execute(_args, ctx) {
         const browser = await ctx.getBrowser();
         const page = await browser.getPage();
@@ -1263,7 +1479,7 @@ ${text}`
 });
 // src/tools/browser/scroll.ts
-import { z as z14 } from "zod";
+import { z as z16 } from "zod";
 var browserScrollTool;
 var init_scroll = __esm({
   "src/tools/browser/scroll.ts"() {
@@ -1271,9 +1487,9 @@ var init_scroll = __esm({
     browserScrollTool = {
       name: "browser_scroll",
       description: "Scroll the current web page.",
-      parameters: z14.object({
-        direction: z14.enum(["up", "down"]).describe("Scroll direction"),
-        amount: z14.number().positive().default(500).describe("Pixels to scroll")
+      parameters: z16.object({
+        direction: z16.enum(["up", "down"]).describe("Scroll direction"),
+        amount: z16.number().positive().default(500).describe("Pixels to scroll")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -1287,7 +1503,7 @@ var init_scroll = __esm({
 });
 // src/tools/control/report.ts
-import { z as z15 } from "zod";
+import { z as z17 } from "zod";
 var reportTool;
 var init_report = __esm({
   "src/tools/control/report.ts"() {
@@ -1295,12 +1511,12 @@ var init_report = __esm({
     reportTool = {
       name: "report",
       description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
-      parameters: z15.object({
-        status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
+      parameters: z17.object({
+        status: z17.enum(["completed", "blocked", "need_guidance"]).describe(
           '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
         ),
-        content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
-        data: z15.unknown().optional().describe("Optional structured data to return")
+        content: z17.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
+        data: z17.unknown().optional().describe("Optional structured data to return")
       }),
       async execute(args) {
         return {
@@ -1323,6 +1539,11 @@ function createToolRegistry() {
   registry2.register(mouseScrollTool);
   registry2.register(keyboardTypeTool);
   registry2.register(keyboardPressTool);
+  registry2.register(clipboardTypeTool);
+  registry2.register(switchInputMethodTool);
+  registry2.register(listWindowsTool);
+  registry2.register(focusWindowTool);
+  registry2.register(windowScreenshotTool);
   registry2.register(runCommandTool);
   registry2.register(fileReadTool);
   registry2.register(fileWriteTool);
@@ -1343,6 +1564,8 @@ var init_tools = __esm({
     init_screenshot();
     init_mouse();
     init_keyboard();
+    init_clipboard();
+    init_window();
     init_command();
     init_read();
     init_write();
@@ -1494,19 +1717,19 @@ var init_session_registry = __esm({
 });
 // src/mcp/tools.ts
-import { z as z16 } from "zod";
+import { z as z18 } from "zod";
 function registerMcpTools(server2, registry2) {
   server2.tool(
     "create_session",
     "Create a new automation session with a small LLM agent. Returns a session_id.",
     {
-      api_key: z16.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
-      base_url: z16.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
-      model: z16.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
-      cdp_url: z16.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
-      timeout_ms: z16.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
-      max_steps: z16.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
-      max_rounds: z16.number().optional().describe("Max instruction rounds per session (default: 20)")
+      api_key: z18.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
+      base_url: z18.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
+      model: z18.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
+      cdp_url: z18.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
+      timeout_ms: z18.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
+      max_steps: z18.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
+      max_rounds: z18.number().optional().describe("Max instruction rounds per session (default: 20)")
     },
     async (args) => {
       const config = loadConfig({
@@ -1533,8 +1756,8 @@ function registerMcpTools(server2, registry2) {
     "send_instruction",
     "Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
     {
-      session_id: z16.string().describe("Session ID from create_session"),
-      instruction: z16.string().describe("What you want the agent to do, in natural language")
+      session_id: z18.string().describe("Session ID from create_session"),
+      instruction: z18.string().describe("What you want the agent to do, in natural language")
     },
     async (args) => {
       const session = registry2.get(args.session_id);
@@ -1584,7 +1807,7 @@ function registerMcpTools(server2, registry2) {
     "done_session",
     "Terminate a session and free all resources.",
     {
-      session_id: z16.string().describe("Session ID to terminate")
+      session_id: z18.string().describe("Session ID to terminate")
     },
     async (args) => {
       await registry2.destroy(args.session_id);