windows-use 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -126,15 +126,18 @@ function buildSystemPrompt() {
126
126
  4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
127
127
 
128
128
  ## Reading Screenshots
129
- - Desktop screenshots include a **coordinate grid overlay**. The grid labels show pixel coordinates that directly correspond to \`mouse_click\` and \`mouse_move\` coordinates.
130
- - Use the grid numbers to estimate the (x, y) position of UI elements. For example, if a button appears near the grid label "400" horizontally and "300" vertically, click at approximately (400, 300).
131
- - The bottom-right corner label shows the total screen dimensions.
129
+ - Screenshots include a **coordinate grid overlay** with **numbered blue reference markers** at grid intersections.
130
+ - Each screenshot also returns a **text coordinate table** mapping marker numbers to exact screen coordinates, e.g. \`[1](200,200) [2](400,200)\`.
131
+ - **How to locate elements precisely**: Find the nearest blue numbered marker to your target in the image, look up its exact (x,y) from the coordinate table, then adjust for the offset.
132
+ - Example: A button is just right of marker \`[7]\`. The table says \`[7](600,400)\`. The button is ~50px right \u2192 click at (650, 400).
133
+ - The red edge labels and bottom-right dimension label are also available for reference.
132
134
 
133
135
  ## Tool Selection
134
136
  - **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
135
137
  - **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
136
138
  - **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
137
139
  - **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
140
+ - **Window management**: Use \`list_windows\` to see all open windows, \`focus_window\` to activate a specific window, and \`window_screenshot\` to capture a specific window (coordinates in the grid are screen-absolute, matching \`mouse_click\`). Focus a window before sending keyboard/mouse input to it.
138
141
 
139
142
  ## Smart Screenshot Strategy
140
143
  - ALWAYS take a screenshot before your first action.
@@ -153,7 +156,12 @@ function buildSystemPrompt() {
153
156
  - **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
154
157
  - **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
155
158
  - **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
156
- - **Text input**: For browser forms, prefer \`browser_type\` with the CSS selector. For desktop apps, click the input field first, then use \`keyboard_type\`.
159
+ - **Text input**:
160
+ - For browser forms, prefer \`browser_type\` with the CSS selector.
161
+ - For desktop apps, click the input field first, then type.
162
+ - Use \`clipboard_type\` (paste via clipboard) when: the text contains non-ASCII characters (Chinese, Japanese, etc.), the current IME might interfere, or you need fast input.
163
+ - Use \`keyboard_type\` (character-by-character) when: you need to trigger per-key events, or for simple ASCII text with English IME active.
164
+ - If \`keyboard_type\` produces garbled text, switch to \`clipboard_type\` or use \`switch_input_method\` to toggle the IME first.
157
165
  - **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
158
166
 
159
167
  ## Error Recovery
@@ -339,11 +347,13 @@ var init_runner = __esm({
339
347
  }
340
348
  if (result.type === "image") {
341
349
  this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
350
+ const textPart = result.content ? `Screenshot captured. ID: ${result.screenshotId}
351
+ ${result.content}` : `Screenshot captured. ID: ${result.screenshotId}`;
342
352
  this.contextManager.append({
343
353
  role: "tool",
344
354
  tool_call_id: toolCall.id,
345
355
  content: [
346
- { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
356
+ { type: "text", text: textPart },
347
357
  {
348
358
  type: "image_url",
349
359
  image_url: {
@@ -725,6 +735,8 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
725
735
  const gridSpacing = options.gridSpacing ?? 100;
726
736
  const labelSpacing = options.labelSpacing ?? 200;
727
737
  const majorSpacing = gridSpacing * 5;
738
+ const offsetX = options.offsetX ?? 0;
739
+ const offsetY = options.offsetY ?? 0;
728
740
  const svgParts = [];
729
741
  for (let x = gridSpacing; x < width; x += gridSpacing) {
730
742
  const isMajor = x % majorSpacing === 0;
@@ -742,8 +754,24 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
742
754
  `<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
743
755
  );
744
756
  }
757
+ const markers = [];
758
+ let markerId = 1;
759
+ for (let y = labelSpacing; y < height; y += labelSpacing) {
760
+ for (let x = labelSpacing; x < width; x += labelSpacing) {
761
+ const screenX = x + offsetX;
762
+ const screenY = y + offsetY;
763
+ markers.push({ id: markerId, screenX, screenY });
764
+ const label = String(markerId);
765
+ const r = label.length > 1 ? 12 : 10;
766
+ svgParts.push(
767
+ `<circle cx="${x}" cy="${y}" r="${r}" fill="rgba(0,110,255,0.85)" stroke="white" stroke-width="1"/>`,
768
+ `<text x="${x}" y="${y + 4}" text-anchor="middle" fill="white" font-size="${label.length > 1 ? 9 : 10}" font-family="Consolas,monospace" font-weight="bold">${label}</text>`
769
+ );
770
+ markerId++;
771
+ }
772
+ }
745
773
  for (let x = labelSpacing; x < width; x += labelSpacing) {
746
- const text = String(x);
774
+ const text = String(x + offsetX);
747
775
  const tw = text.length * 7.5 + 6;
748
776
  svgParts.push(
749
777
  `<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
@@ -751,16 +779,18 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
751
779
  );
752
780
  }
753
781
  for (let y = labelSpacing; y < height; y += labelSpacing) {
754
- const text = String(y);
782
+ const text = String(y + offsetY);
755
783
  const tw = text.length * 7.5 + 6;
756
784
  svgParts.push(
757
785
  `<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
758
786
  `<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
759
787
  );
760
788
  }
789
+ const originText = `${offsetX},${offsetY}`;
790
+ const originTw = originText.length * 7.5 + 6;
761
791
  svgParts.push(
762
- `<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
763
- `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
792
+ `<rect x="2" y="2" width="${originTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
793
+ `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${originText}</text>`
764
794
  );
765
795
  const dimText = `${width}x${height}`;
766
796
  const dimTw = dimText.length * 7.5 + 6;
@@ -771,7 +801,16 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
771
801
  const svg = Buffer.from(
772
802
  `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
773
803
  );
774
- return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
804
+ const image = await sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
805
+ const cols = Math.floor((width - 1) / labelSpacing);
806
+ const rows = [];
807
+ for (let i = 0; i < markers.length; i += cols) {
808
+ const row = markers.slice(i, i + cols).map((m) => `[${m.id}](${m.screenX},${m.screenY})`).join(" ");
809
+ rows.push(row);
810
+ }
811
+ const gridRef = `Grid reference points (marker \u2192 screen coordinates for mouse_click):
812
+ ${rows.join("\n")}`;
813
+ return { image, gridRef };
775
814
  }
776
815
  var init_grid_overlay = __esm({
777
816
  "src/tools/windows/grid-overlay.ts"() {
@@ -810,13 +849,14 @@ var init_screenshot = __esm({
810
849
  }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
811
850
  const cleanBase64 = resized.toString("base64");
812
851
  const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
813
- const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
852
+ const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH);
814
853
  const gridBase64 = gridImage.toString("base64");
815
854
  return {
816
855
  type: "image",
817
856
  base64: gridBase64,
818
857
  mimeType: "image/jpeg",
819
- screenshotId: id
858
+ screenshotId: id,
859
+ content: gridRef
820
860
  };
821
861
  }
822
862
  };
@@ -976,8 +1016,184 @@ var init_keyboard = __esm({
976
1016
  }
977
1017
  });
978
1018
 
979
- // src/tools/windows/command.ts
1019
+ // src/tools/windows/clipboard.ts
980
1020
  import { z as z5 } from "zod";
1021
/**
 * Loads the nut-js automation module on demand via a dynamic import, so the
 * dependency is only resolved when a clipboard/IME tool actually runs.
 *
 * @returns {Promise<object>} The `@nut-tree-fork/nut-js` module namespace.
 */
async function getNutJs3() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
1024
// Tool definitions for src/tools/windows/clipboard.ts. The bindings are declared
// here and assigned when init_clipboard() runs (bundler lazy-module wrapper;
// presumably executed once — confirm against the __esm helper).
var clipboardTypeTool, switchInputMethodTool;
var init_clipboard = __esm({
  "src/tools/windows/clipboard.ts"() {
    "use strict";
    /**
     * Presses a key chord and releases it again.
     *
     * The release runs in `finally` so that a failure while pressing (for
     * example after the modifier went down but before the second key did)
     * cannot leave Ctrl/Win held and corrupt every later keyboard action.
     *
     * @param {object} nut - The loaded nut-js module.
     * @param {...*} keys - Keys of the chord, in press order.
     * @returns {Promise<void>}
     */
    const tapChord = async (nut, ...keys) => {
      try {
        await nut.keyboard.pressKey(...keys);
      } finally {
        await nut.keyboard.releaseKey(...keys);
      }
    };
    clipboardTypeTool = {
      name: "clipboard_type",
      description: "Type text by copying it to the clipboard and pasting (Ctrl+V). This bypasses input method (IME) issues and is faster than keyboard_type. Use this when the current IME might interfere, or for non-ASCII text (Chinese, Japanese, etc.).",
      parameters: z5.object({
        text: z5.string().describe("The text to paste")
      }),
      /**
       * Puts `args.text` on the clipboard and pastes it with Ctrl+V.
       * Note: the previous clipboard content is overwritten and not restored.
       *
       * @param {{ text: string }} args - Validated tool arguments.
       * @returns {Promise<{ type: "text", content: string }>} Confirmation echoing the pasted text.
       */
      async execute(args) {
        const nut = await getNutJs3();
        await nut.clipboard.setContent(args.text);
        await tapChord(nut, nut.Key.LeftControl, nut.Key.V);
        return { type: "text", content: `Pasted: "${args.text}"` };
      }
    };
    switchInputMethodTool = {
      name: "switch_input_method",
      description: "Toggle the input method (IME) by pressing Win+Space. Use this before keyboard_type if the current IME is wrong. Take a screenshot afterward to verify the switch.",
      parameters: z5.object({}),
      /**
       * Sends Win+Space. The tool does not check which input method ends up
       * active; the description asks the caller to verify with a screenshot.
       *
       * @returns {Promise<{ type: "text", content: string }>} Fixed confirmation message.
       */
      async execute() {
        const nut = await getNutJs3();
        await tapChord(nut, nut.Key.LeftWin, nut.Key.Space);
        return { type: "text", content: "Toggled input method (Win+Space)" };
      }
    };
  }
});
1055
+
1056
+ // src/tools/windows/window.ts
1057
+ import { z as z6 } from "zod";
1058
+ import sharp3 from "sharp";
1059
/**
 * Loads the nut-js automation module on demand via a dynamic import; used by
 * the window tools to enumerate and focus windows.
 *
 * @returns {Promise<object>} The `@nut-tree-fork/nut-js` module namespace.
 */
async function getNutJs4() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
1062
/**
 * Loads the `node-screenshots` module on demand via a dynamic import; the
 * window tools read its `Window` export to list and capture windows.
 *
 * @returns {Promise<object>} The `node-screenshots` module namespace.
 */
async function getNodeScreenshots() {
  const screenshotsModule = await import("node-screenshots");
  return screenshotsModule;
}
1065
/**
 * Finds the window whose title best matches `title`, ignoring case.
 *
 * An exact title match always wins; otherwise the first window whose title
 * contains `title` as a substring is returned.
 *
 * @param {Array<{ title: () => string }>} windows - Candidates exposing a `title()` accessor.
 * @param {string} title - Full title or fragment to look for.
 * @returns {object | undefined} The matching window, or `undefined` when
 *   nothing matches or the query is blank.
 */
function findWindowByTitle(windows, title) {
  const lower = title.toLowerCase();
  // A blank query is a substring of every title, so it would "match" whatever
  // window happens to be first. Treat it as no match instead.
  if (lower.trim() === "") {
    return undefined;
  }
  let partial;
  // Single pass: each title() accessor is read at most once per window.
  for (const candidate of windows) {
    const candidateTitle = candidate.title().toLowerCase();
    if (candidateTitle === lower) {
      return candidate;
    }
    if (partial === undefined && candidateTitle.includes(lower)) {
      partial = candidate;
    }
  }
  return partial;
}
1075
// Tool definitions for src/tools/windows/window.ts. The bindings are declared
// here and assigned when init_window() runs (bundler lazy-module wrapper).
var listWindowsTool, focusWindowTool, windowScreenshotTool;
var init_window = __esm({
  "src/tools/windows/window.ts"() {
    "use strict";
    init_grid_overlay();
    listWindowsTool = {
      name: "list_windows",
      description: "List all visible windows with their titles, positions, and sizes.",
      parameters: z6.object({}),
      /**
       * Enumerates windows via node-screenshots, drops those with a blank
       * title, and formats one line per window ("*" marks the focused one).
       *
       * @returns {Promise<{ type: "text", content: string }>} Human-readable window list.
       */
      async execute() {
        const { Window } = await getNodeScreenshots();
        const windows = Window.all();
        const list = windows.filter((w) => w.title().trim().length > 0).map((w) => ({
          id: w.id(),
          title: w.title(),
          appName: w.appName(),
          x: w.x(),
          y: w.y(),
          width: w.width(),
          height: w.height(),
          isMinimized: w.isMinimized(),
          isFocused: w.isFocused()
        }));
        const formatted = list.map(
          (w) => `[${w.isFocused ? "*" : " "}] "${w.title}" (${w.appName}) \u2014 pos:(${w.x},${w.y}) size:${w.width}x${w.height}${w.isMinimized ? " [minimized]" : ""}`
        ).join("\n");
        return {
          type: "text",
          content: `Found ${list.length} windows:\n${formatted}`
        };
      }
    };
    focusWindowTool = {
      name: "focus_window",
      description: "Focus (activate) a window by its title. Uses partial, case-insensitive matching.",
      parameters: z6.object({
        title: z6.string().describe("Window title to search for (partial match)")
      }),
      /**
       * Focuses the window whose title matches `args.title` (case-insensitive).
       * An exact match wins; otherwise the first partial match is used.
       *
       * @param {{ title: string }} args - Validated tool arguments.
       * @returns {Promise<{ type: "text", content: string }>} Confirmation, or an
       *   "Error: ..." text when the query is blank or nothing matches.
       */
      async execute(args) {
        const lower = args.title.toLowerCase();
        // A blank query is contained in every title and would focus an
        // arbitrary window; reject it up front.
        if (lower.trim() === "") {
          return {
            type: "text",
            content: "Error: Window title must not be empty"
          };
        }
        const nut = await getNutJs4();
        const windows = await nut.getWindows();
        let target = null;
        let targetTitle = "";
        for (const w of windows) {
          // nut-js windows expose `title` as an awaitable property.
          const t = await w.title;
          const candidate = t.toLowerCase();
          if (candidate === lower) {
            target = w;
            targetTitle = t;
            break;
          }
          // Remember only the first partial match; keep scanning for an exact one.
          if (!target && candidate.includes(lower)) {
            target = w;
            targetTitle = t;
          }
        }
        if (!target) {
          return {
            type: "text",
            content: `Error: No window found matching "${args.title}"`
          };
        }
        await target.focus();
        return { type: "text", content: `Focused window: "${targetTitle}"` };
      }
    };
    windowScreenshotTool = {
      name: "window_screenshot",
      description: "Capture a screenshot of a specific window by its title. The coordinate grid shows screen-absolute coordinates (matching mouse_click). Returns a screenshot ID.",
      parameters: z6.object({
        title: z6.string().describe("Window title to search for (partial match)")
      }),
      /**
       * Captures one window, stores the clean image in the screenshot store and
       * returns a copy with the coordinate grid drawn on top, offset by the
       * window position so the labels read as screen coordinates.
       *
       * @param {{ title: string }} args - Validated tool arguments.
       * @param {object} ctx - Tool context; `ctx.screenshots.save` is used here.
       * @returns {Promise<object>} An image result (`base64`, `mimeType`,
       *   `screenshotId`, `content` = grid reference text), or an "Error: ..."
       *   text result when the query is blank, nothing matches, or the capture
       *   has no area.
       */
      async execute(args, ctx) {
        // Same blank-query guard as focus_window: never pick an arbitrary window.
        if (args.title.trim() === "") {
          return {
            type: "text",
            content: "Error: Window title must not be empty"
          };
        }
        const { Window } = await getNodeScreenshots();
        const windows = Window.all().filter(
          (w) => w.title().trim().length > 0
        );
        const target = findWindowByTitle(windows, args.title);
        if (!target) {
          return {
            type: "text",
            content: `Error: No window found matching "${args.title}"`
          };
        }
        const winTitle = target.title();
        const winX = target.x();
        const winY = target.y();
        const image = target.captureImageSync();
        const physW = image.width;
        const physH = image.height;
        const monitor = target.currentMonitor();
        const scaleFactor = monitor ? monitor.scaleFactor() : 1;
        // Downscale the physical capture to logical pixels, as the desktop
        // screenshot tool does.
        const logicalW = Math.round(physW / scaleFactor);
        const logicalH = Math.round(physH / scaleFactor);
        // sharp's resize() rejects non-positive dimensions; report a readable
        // error instead of letting it throw on a degenerate capture.
        if (logicalW < 1 || logicalH < 1) {
          return {
            type: "text",
            content: `Error: Window "${winTitle}" has no capturable area (${physW}x${physH}); it may be minimized`
          };
        }
        const raw = image.toRawSync();
        const resized = await sharp3(raw, {
          raw: { width: physW, height: physH, channels: 4 }
        }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
        const cleanBase64 = resized.toString("base64");
        const id = ctx.screenshots.save(
          cleanBase64,
          "image/jpeg",
          `window: ${winTitle}`
        );
        // NOTE(review): winX/winY are passed through unscaled while the image
        // is divided by scaleFactor — confirm x()/y() are already in the same
        // (logical) coordinate space that mouse_click expects on HiDPI setups.
        const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH, {
          offsetX: winX,
          offsetY: winY
        });
        const gridBase64 = gridImage.toString("base64");
        return {
          type: "image",
          base64: gridBase64,
          mimeType: "image/jpeg",
          screenshotId: id,
          content: gridRef
        };
      }
    };
  }
});
1194
+
1195
+ // src/tools/windows/command.ts
1196
+ import { z as z7 } from "zod";
981
1197
  import { exec } from "child_process";
982
1198
  var MAX_OUTPUT_LENGTH, runCommandTool;
983
1199
  var init_command = __esm({
@@ -987,9 +1203,9 @@ var init_command = __esm({
987
1203
  runCommandTool = {
988
1204
  name: "run_command",
989
1205
  description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
990
- parameters: z5.object({
991
- command: z5.string().describe("The command to execute"),
992
- timeout: z5.number().positive().default(3e4).describe("Timeout in milliseconds")
1206
+ parameters: z7.object({
1207
+ command: z7.string().describe("The command to execute"),
1208
+ timeout: z7.number().positive().default(3e4).describe("Timeout in milliseconds")
993
1209
  }),
994
1210
  async execute(args) {
995
1211
  return new Promise((resolve) => {
@@ -1022,7 +1238,7 @@ var init_command = __esm({
1022
1238
  });
1023
1239
 
1024
1240
  // src/tools/file/read.ts
1025
- import { z as z6 } from "zod";
1241
+ import { z as z8 } from "zod";
1026
1242
  import { readFile } from "fs/promises";
1027
1243
  var MAX_FILE_SIZE, fileReadTool;
1028
1244
  var init_read = __esm({
@@ -1032,8 +1248,8 @@ var init_read = __esm({
1032
1248
  fileReadTool = {
1033
1249
  name: "file_read",
1034
1250
  description: "Read the contents of a file at the given path.",
1035
- parameters: z6.object({
1036
- path: z6.string().describe("Absolute path to the file")
1251
+ parameters: z8.object({
1252
+ path: z8.string().describe("Absolute path to the file")
1037
1253
  }),
1038
1254
  async execute(args) {
1039
1255
  try {
@@ -1055,7 +1271,7 @@ var init_read = __esm({
1055
1271
  });
1056
1272
 
1057
1273
  // src/tools/file/write.ts
1058
- import { z as z7 } from "zod";
1274
+ import { z as z9 } from "zod";
1059
1275
  import { writeFile, mkdir } from "fs/promises";
1060
1276
  import { dirname } from "path";
1061
1277
  var fileWriteTool;
@@ -1065,9 +1281,9 @@ var init_write = __esm({
1065
1281
  fileWriteTool = {
1066
1282
  name: "file_write",
1067
1283
  description: "Write content to a file at the given path. Creates parent directories if needed.",
1068
- parameters: z7.object({
1069
- path: z7.string().describe("Absolute path to the file"),
1070
- content: z7.string().describe("Content to write")
1284
+ parameters: z9.object({
1285
+ path: z9.string().describe("Absolute path to the file"),
1286
+ content: z9.string().describe("Content to write")
1071
1287
  }),
1072
1288
  async execute(args) {
1073
1289
  try {
@@ -1084,7 +1300,7 @@ var init_write = __esm({
1084
1300
  });
1085
1301
 
1086
1302
  // src/tools/file/image.ts
1087
- import { z as z8 } from "zod";
1303
+ import { z as z10 } from "zod";
1088
1304
  import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
1089
1305
  import { extname } from "path";
1090
1306
  var IMAGE_EXTS, useLocalImageTool;
@@ -1095,9 +1311,9 @@ var init_image = __esm({
1095
1311
  useLocalImageTool = {
1096
1312
  name: "use_local_image",
1097
1313
  description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
1098
- parameters: z8.object({
1099
- path: z8.string().describe("Absolute path to the image file"),
1100
- label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
1314
+ parameters: z10.object({
1315
+ path: z10.string().describe("Absolute path to the image file"),
1316
+ label: z10.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
1101
1317
  }),
1102
1318
  async execute(args, ctx) {
1103
1319
  if (!existsSync3(args.path)) {
@@ -1123,7 +1339,7 @@ var init_image = __esm({
1123
1339
  });
1124
1340
 
1125
1341
  // src/tools/browser/navigate.ts
1126
- import { z as z9 } from "zod";
1342
+ import { z as z11 } from "zod";
1127
1343
  var browserNavigateTool;
1128
1344
  var init_navigate = __esm({
1129
1345
  "src/tools/browser/navigate.ts"() {
@@ -1131,8 +1347,8 @@ var init_navigate = __esm({
1131
1347
  browserNavigateTool = {
1132
1348
  name: "browser_navigate",
1133
1349
  description: "Navigate the browser to a URL.",
1134
- parameters: z9.object({
1135
- url: z9.string().describe("The URL to navigate to")
1350
+ parameters: z11.object({
1351
+ url: z11.string().describe("The URL to navigate to")
1136
1352
  }),
1137
1353
  async execute(args, ctx) {
1138
1354
  const browser = await ctx.getBrowser();
@@ -1147,7 +1363,7 @@ Page title: ${title}` };
1147
1363
  });
1148
1364
 
1149
1365
  // src/tools/browser/click.ts
1150
- import { z as z10 } from "zod";
1366
+ import { z as z12 } from "zod";
1151
1367
  var browserClickTool;
1152
1368
  var init_click = __esm({
1153
1369
  "src/tools/browser/click.ts"() {
@@ -1155,8 +1371,8 @@ var init_click = __esm({
1155
1371
  browserClickTool = {
1156
1372
  name: "browser_click",
1157
1373
  description: "Click an element on the web page using a CSS selector or text content.",
1158
- parameters: z10.object({
1159
- selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
1374
+ parameters: z12.object({
1375
+ selector: z12.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
1160
1376
  }),
1161
1377
  async execute(args, ctx) {
1162
1378
  const browser = await ctx.getBrowser();
@@ -1169,7 +1385,7 @@ var init_click = __esm({
1169
1385
  });
1170
1386
 
1171
1387
  // src/tools/browser/type.ts
1172
- import { z as z11 } from "zod";
1388
+ import { z as z13 } from "zod";
1173
1389
  var browserTypeTool;
1174
1390
  var init_type = __esm({
1175
1391
  "src/tools/browser/type.ts"() {
@@ -1177,10 +1393,10 @@ var init_type = __esm({
1177
1393
  browserTypeTool = {
1178
1394
  name: "browser_type",
1179
1395
  description: "Type text into an input field on the web page.",
1180
- parameters: z11.object({
1181
- selector: z11.string().describe("CSS selector for the input element"),
1182
- text: z11.string().describe("Text to type"),
1183
- clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
1396
+ parameters: z13.object({
1397
+ selector: z13.string().describe("CSS selector for the input element"),
1398
+ text: z13.string().describe("Text to type"),
1399
+ clear: z13.boolean().default(true).describe("Whether to clear the field before typing")
1184
1400
  }),
1185
1401
  async execute(args, ctx) {
1186
1402
  const browser = await ctx.getBrowser();
@@ -1197,7 +1413,7 @@ var init_type = __esm({
1197
1413
  });
1198
1414
 
1199
1415
  // src/tools/browser/screenshot.ts
1200
- import { z as z12 } from "zod";
1416
+ import { z as z14 } from "zod";
1201
1417
  var browserScreenshotTool;
1202
1418
  var init_screenshot2 = __esm({
1203
1419
  "src/tools/browser/screenshot.ts"() {
@@ -1205,8 +1421,8 @@ var init_screenshot2 = __esm({
1205
1421
  browserScreenshotTool = {
1206
1422
  name: "browser_screenshot",
1207
1423
  description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
1208
- parameters: z12.object({
1209
- fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
1424
+ parameters: z14.object({
1425
+ fullPage: z14.boolean().default(false).describe("Whether to capture the full scrollable page")
1210
1426
  }),
1211
1427
  async execute(args, ctx) {
1212
1428
  const browser = await ctx.getBrowser();
@@ -1231,7 +1447,7 @@ var init_screenshot2 = __esm({
1231
1447
  });
1232
1448
 
1233
1449
  // src/tools/browser/content.ts
1234
- import { z as z13 } from "zod";
1450
+ import { z as z15 } from "zod";
1235
1451
  var MAX_CONTENT_LENGTH, browserContentTool;
1236
1452
  var init_content = __esm({
1237
1453
  "src/tools/browser/content.ts"() {
@@ -1240,7 +1456,7 @@ var init_content = __esm({
1240
1456
  browserContentTool = {
1241
1457
  name: "browser_content",
1242
1458
  description: "Get the text content of the current web page. Returns visible text, not HTML.",
1243
- parameters: z13.object({}),
1459
+ parameters: z15.object({}),
1244
1460
  async execute(_args, ctx) {
1245
1461
  const browser = await ctx.getBrowser();
1246
1462
  const page = await browser.getPage();
@@ -1263,7 +1479,7 @@ ${text}`
1263
1479
  });
1264
1480
 
1265
1481
  // src/tools/browser/scroll.ts
1266
- import { z as z14 } from "zod";
1482
+ import { z as z16 } from "zod";
1267
1483
  var browserScrollTool;
1268
1484
  var init_scroll = __esm({
1269
1485
  "src/tools/browser/scroll.ts"() {
@@ -1271,9 +1487,9 @@ var init_scroll = __esm({
1271
1487
  browserScrollTool = {
1272
1488
  name: "browser_scroll",
1273
1489
  description: "Scroll the current web page.",
1274
- parameters: z14.object({
1275
- direction: z14.enum(["up", "down"]).describe("Scroll direction"),
1276
- amount: z14.number().positive().default(500).describe("Pixels to scroll")
1490
+ parameters: z16.object({
1491
+ direction: z16.enum(["up", "down"]).describe("Scroll direction"),
1492
+ amount: z16.number().positive().default(500).describe("Pixels to scroll")
1277
1493
  }),
1278
1494
  async execute(args, ctx) {
1279
1495
  const browser = await ctx.getBrowser();
@@ -1287,7 +1503,7 @@ var init_scroll = __esm({
1287
1503
  });
1288
1504
 
1289
1505
  // src/tools/control/report.ts
1290
- import { z as z15 } from "zod";
1506
+ import { z as z17 } from "zod";
1291
1507
  var reportTool;
1292
1508
  var init_report = __esm({
1293
1509
  "src/tools/control/report.ts"() {
@@ -1295,12 +1511,12 @@ var init_report = __esm({
1295
1511
  reportTool = {
1296
1512
  name: "report",
1297
1513
  description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
1298
- parameters: z15.object({
1299
- status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
1514
+ parameters: z17.object({
1515
+ status: z17.enum(["completed", "blocked", "need_guidance"]).describe(
1300
1516
  '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
1301
1517
  ),
1302
- content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
1303
- data: z15.unknown().optional().describe("Optional structured data to return")
1518
+ content: z17.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
1519
+ data: z17.unknown().optional().describe("Optional structured data to return")
1304
1520
  }),
1305
1521
  async execute(args) {
1306
1522
  return {
@@ -1323,6 +1539,11 @@ function createToolRegistry() {
1323
1539
  registry2.register(mouseScrollTool);
1324
1540
  registry2.register(keyboardTypeTool);
1325
1541
  registry2.register(keyboardPressTool);
1542
+ registry2.register(clipboardTypeTool);
1543
+ registry2.register(switchInputMethodTool);
1544
+ registry2.register(listWindowsTool);
1545
+ registry2.register(focusWindowTool);
1546
+ registry2.register(windowScreenshotTool);
1326
1547
  registry2.register(runCommandTool);
1327
1548
  registry2.register(fileReadTool);
1328
1549
  registry2.register(fileWriteTool);
@@ -1343,6 +1564,8 @@ var init_tools = __esm({
1343
1564
  init_screenshot();
1344
1565
  init_mouse();
1345
1566
  init_keyboard();
1567
+ init_clipboard();
1568
+ init_window();
1346
1569
  init_command();
1347
1570
  init_read();
1348
1571
  init_write();
@@ -1494,19 +1717,19 @@ var init_session_registry = __esm({
1494
1717
  });
1495
1718
 
1496
1719
  // src/mcp/tools.ts
1497
- import { z as z16 } from "zod";
1720
+ import { z as z18 } from "zod";
1498
1721
  function registerMcpTools(server2, registry2) {
1499
1722
  server2.tool(
1500
1723
  "create_session",
1501
1724
  "Create a new automation session with a small LLM agent. Returns a session_id.",
1502
1725
  {
1503
- api_key: z16.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
1504
- base_url: z16.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
1505
- model: z16.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
1506
- cdp_url: z16.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
1507
- timeout_ms: z16.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
1508
- max_steps: z16.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
1509
- max_rounds: z16.number().optional().describe("Max instruction rounds per session (default: 20)")
1726
+ api_key: z18.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
1727
+ base_url: z18.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
1728
+ model: z18.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
1729
+ cdp_url: z18.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
1730
+ timeout_ms: z18.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
1731
+ max_steps: z18.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
1732
+ max_rounds: z18.number().optional().describe("Max instruction rounds per session (default: 20)")
1510
1733
  },
1511
1734
  async (args) => {
1512
1735
  const config = loadConfig({
@@ -1533,8 +1756,8 @@ function registerMcpTools(server2, registry2) {
1533
1756
  "send_instruction",
1534
1757
  "Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
1535
1758
  {
1536
- session_id: z16.string().describe("Session ID from create_session"),
1537
- instruction: z16.string().describe("What you want the agent to do, in natural language")
1759
+ session_id: z18.string().describe("Session ID from create_session"),
1760
+ instruction: z18.string().describe("What you want the agent to do, in natural language")
1538
1761
  },
1539
1762
  async (args) => {
1540
1763
  const session = registry2.get(args.session_id);
@@ -1584,7 +1807,7 @@ function registerMcpTools(server2, registry2) {
1584
1807
  "done_session",
1585
1808
  "Terminate a session and free all resources.",
1586
1809
  {
1587
- session_id: z16.string().describe("Session ID to terminate")
1810
+ session_id: z18.string().describe("Session ID to terminate")
1588
1811
  },
1589
1812
  async (args) => {
1590
1813
  await registry2.destroy(args.session_id);