windows-use 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -107,6 +107,7 @@ type ToolResult = {
107
107
  base64: string;
108
108
  mimeType: 'image/png' | 'image/jpeg';
109
109
  screenshotId: string;
110
+ content?: string;
110
111
  } | {
111
112
  type: 'report';
112
113
  status: 'completed' | 'blocked' | 'need_guidance';
package/dist/index.js CHANGED
@@ -97,15 +97,18 @@ function buildSystemPrompt() {
97
97
  4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
98
98
 
99
99
  ## Reading Screenshots
100
- - Desktop screenshots include a **coordinate grid overlay**. The grid labels show pixel coordinates that directly correspond to \`mouse_click\` and \`mouse_move\` coordinates.
101
- - Use the grid numbers to estimate the (x, y) position of UI elements. For example, if a button appears near the grid label "400" horizontally and "300" vertically, click at approximately (400, 300).
102
- - The bottom-right corner label shows the total screen dimensions.
100
+ - Screenshots include a **coordinate grid overlay** with **numbered blue reference markers** at grid intersections.
101
+ - Each screenshot also returns a **text coordinate table** mapping marker numbers to exact screen coordinates, e.g. \`[1](200,200) [2](400,200)\`.
102
+ - **How to locate elements precisely**: Find the nearest blue numbered marker to your target in the image, look up its exact (x,y) from the coordinate table, then adjust for the offset.
103
+ - Example: A button is just right of marker \`[7]\`. The table says \`[7](600,400)\`. The button is ~50px right \u2192 click at (650, 400).
104
+ - The red edge labels and bottom-right dimension label are also available for reference.
103
105
 
104
106
  ## Tool Selection
105
107
  - **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
106
108
  - **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
107
109
  - **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
108
110
  - **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
111
+ - **Window management**: Use \`list_windows\` to see all open windows, \`focus_window\` to activate a specific window, and \`window_screenshot\` to capture a specific window (coordinates in the grid are screen-absolute, matching \`mouse_click\`). Focus a window before sending keyboard/mouse input to it.
109
112
 
110
113
  ## Smart Screenshot Strategy
111
114
  - ALWAYS take a screenshot before your first action.
@@ -124,7 +127,12 @@ function buildSystemPrompt() {
124
127
  - **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
125
128
  - **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
126
129
  - **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
127
- - **Text input**: For browser forms, prefer \`browser_type\` with the CSS selector. For desktop apps, click the input field first, then use \`keyboard_type\`.
130
+ - **Text input**:
131
+ - For browser forms, prefer \`browser_type\` with the CSS selector.
132
+ - For desktop apps, click the input field first, then type.
133
+ - Use \`clipboard_type\` (paste via clipboard) when: the text contains non-ASCII characters (Chinese, Japanese, etc.), the current IME might interfere, or you need fast input.
134
+ - Use \`keyboard_type\` (character-by-character) when: you need to trigger per-key events, or for simple ASCII text with English IME active.
135
+ - If \`keyboard_type\` produces garbled text, switch to \`clipboard_type\` or use \`switch_input_method\` to toggle the IME first.
128
136
  - **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
129
137
 
130
138
  ## Error Recovery
@@ -300,11 +308,13 @@ var AgentRunner = class {
300
308
  }
301
309
  if (result.type === "image") {
302
310
  this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
311
+ const textPart = result.content ? `Screenshot captured. ID: ${result.screenshotId}
312
+ ${result.content}` : `Screenshot captured. ID: ${result.screenshotId}`;
303
313
  this.contextManager.append({
304
314
  role: "tool",
305
315
  tool_call_id: toolCall.id,
306
316
  content: [
307
- { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
317
+ { type: "text", text: textPart },
308
318
  {
309
319
  type: "image_url",
310
320
  image_url: {
@@ -670,6 +680,8 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
670
680
  const gridSpacing = options.gridSpacing ?? 100;
671
681
  const labelSpacing = options.labelSpacing ?? 200;
672
682
  const majorSpacing = gridSpacing * 5;
683
+ const offsetX = options.offsetX ?? 0;
684
+ const offsetY = options.offsetY ?? 0;
673
685
  const svgParts = [];
674
686
  for (let x = gridSpacing; x < width; x += gridSpacing) {
675
687
  const isMajor = x % majorSpacing === 0;
@@ -687,8 +699,24 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
687
699
  `<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
688
700
  );
689
701
  }
702
+ const markers = [];
703
+ let markerId = 1;
704
+ for (let y = labelSpacing; y < height; y += labelSpacing) {
705
+ for (let x = labelSpacing; x < width; x += labelSpacing) {
706
+ const screenX = x + offsetX;
707
+ const screenY = y + offsetY;
708
+ markers.push({ id: markerId, screenX, screenY });
709
+ const label = String(markerId);
710
+ const r = label.length > 1 ? 12 : 10;
711
+ svgParts.push(
712
+ `<circle cx="${x}" cy="${y}" r="${r}" fill="rgba(0,110,255,0.85)" stroke="white" stroke-width="1"/>`,
713
+ `<text x="${x}" y="${y + 4}" text-anchor="middle" fill="white" font-size="${label.length > 1 ? 9 : 10}" font-family="Consolas,monospace" font-weight="bold">${label}</text>`
714
+ );
715
+ markerId++;
716
+ }
717
+ }
690
718
  for (let x = labelSpacing; x < width; x += labelSpacing) {
691
- const text = String(x);
719
+ const text = String(x + offsetX);
692
720
  const tw = text.length * 7.5 + 6;
693
721
  svgParts.push(
694
722
  `<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
@@ -696,16 +724,18 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
696
724
  );
697
725
  }
698
726
  for (let y = labelSpacing; y < height; y += labelSpacing) {
699
- const text = String(y);
727
+ const text = String(y + offsetY);
700
728
  const tw = text.length * 7.5 + 6;
701
729
  svgParts.push(
702
730
  `<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
703
731
  `<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
704
732
  );
705
733
  }
734
+ const originText = `${offsetX},${offsetY}`;
735
+ const originTw = originText.length * 7.5 + 6;
706
736
  svgParts.push(
707
- `<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
708
- `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
737
+ `<rect x="2" y="2" width="${originTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
738
+ `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${originText}</text>`
709
739
  );
710
740
  const dimText = `${width}x${height}`;
711
741
  const dimTw = dimText.length * 7.5 + 6;
@@ -716,7 +746,16 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
716
746
  const svg = Buffer.from(
717
747
  `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
718
748
  );
719
- return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
749
+ const image = await sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
750
+ const cols = Math.floor((width - 1) / labelSpacing);
751
+ const rows = [];
752
+ for (let i = 0; i < markers.length; i += cols) {
753
+ const row = markers.slice(i, i + cols).map((m) => `[${m.id}](${m.screenX},${m.screenY})`).join(" ");
754
+ rows.push(row);
755
+ }
756
+ const gridRef = `Grid reference points (marker \u2192 screen coordinates for mouse_click):
757
+ ${rows.join("\n")}`;
758
+ return { image, gridRef };
720
759
  }
721
760
 
722
761
  // src/tools/windows/screenshot.ts
@@ -743,13 +782,14 @@ var screenshotTool = {
743
782
  }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
744
783
  const cleanBase64 = resized.toString("base64");
745
784
  const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
746
- const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
785
+ const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH);
747
786
  const gridBase64 = gridImage.toString("base64");
748
787
  return {
749
788
  type: "image",
750
789
  base64: gridBase64,
751
790
  mimeType: "image/jpeg",
752
- screenshotId: id
791
+ screenshotId: id,
792
+ content: gridRef
753
793
  };
754
794
  }
755
795
  };
@@ -895,16 +935,179 @@ var keyboardPressTool = {
895
935
  }
896
936
  };
897
937
 
898
- // src/tools/windows/command.ts
938
+ // src/tools/windows/clipboard.ts
899
939
  import { z as z5 } from "zod";
940
+ async function getNutJs3() {
941
+ return import("@nut-tree-fork/nut-js");
942
+ }
943
+ var clipboardTypeTool = {
944
+ name: "clipboard_type",
945
+ description: "Type text by copying it to the clipboard and pasting (Ctrl+V). This bypasses input method (IME) issues and is faster than keyboard_type. Use this when the current IME might interfere, or for non-ASCII text (Chinese, Japanese, etc.).",
946
+ parameters: z5.object({
947
+ text: z5.string().describe("The text to paste")
948
+ }),
949
+ async execute(args) {
950
+ const nut = await getNutJs3();
951
+ await nut.clipboard.setContent(args.text);
952
+ await nut.keyboard.pressKey(nut.Key.LeftControl, nut.Key.V);
953
+ await nut.keyboard.releaseKey(nut.Key.LeftControl, nut.Key.V);
954
+ return { type: "text", content: `Pasted: "${args.text}"` };
955
+ }
956
+ };
957
+ var switchInputMethodTool = {
958
+ name: "switch_input_method",
959
+ description: "Toggle the input method (IME) by pressing Win+Space. Use this before keyboard_type if the current IME is wrong. Take a screenshot afterward to verify the switch.",
960
+ parameters: z5.object({}),
961
+ async execute() {
962
+ const nut = await getNutJs3();
963
+ await nut.keyboard.pressKey(nut.Key.LeftWin, nut.Key.Space);
964
+ await nut.keyboard.releaseKey(nut.Key.LeftWin, nut.Key.Space);
965
+ return { type: "text", content: "Toggled input method (Win+Space)" };
966
+ }
967
+ };
968
+
969
+ // src/tools/windows/window.ts
970
+ import { z as z6 } from "zod";
971
+ import sharp3 from "sharp";
972
+ async function getNutJs4() {
973
+ return import("@nut-tree-fork/nut-js");
974
+ }
975
+ async function getNodeScreenshots() {
976
+ return import("node-screenshots");
977
+ }
978
+ function findWindowByTitle(windows, title) {
979
+ const lower = title.toLowerCase();
980
+ const exact = windows.find(
981
+ (w) => w.title().toLowerCase() === lower
982
+ );
983
+ if (exact) return exact;
984
+ return windows.find(
985
+ (w) => w.title().toLowerCase().includes(lower)
986
+ );
987
+ }
988
+ var listWindowsTool = {
989
+ name: "list_windows",
990
+ description: "List all visible windows with their titles, positions, and sizes.",
991
+ parameters: z6.object({}),
992
+ async execute() {
993
+ const { Window } = await getNodeScreenshots();
994
+ const windows = Window.all();
995
+ const list = windows.filter((w) => w.title().trim().length > 0).map((w) => ({
996
+ id: w.id(),
997
+ title: w.title(),
998
+ appName: w.appName(),
999
+ x: w.x(),
1000
+ y: w.y(),
1001
+ width: w.width(),
1002
+ height: w.height(),
1003
+ isMinimized: w.isMinimized(),
1004
+ isFocused: w.isFocused()
1005
+ }));
1006
+ const formatted = list.map(
1007
+ (w) => `[${w.isFocused ? "*" : " "}] "${w.title}" (${w.appName}) \u2014 pos:(${w.x},${w.y}) size:${w.width}x${w.height}${w.isMinimized ? " [minimized]" : ""}`
1008
+ ).join("\n");
1009
+ return {
1010
+ type: "text",
1011
+ content: `Found ${list.length} windows:
1012
+ ${formatted}`
1013
+ };
1014
+ }
1015
+ };
1016
+ var focusWindowTool = {
1017
+ name: "focus_window",
1018
+ description: "Focus (activate) a window by its title. Uses partial, case-insensitive matching.",
1019
+ parameters: z6.object({
1020
+ title: z6.string().describe("Window title to search for (partial match)")
1021
+ }),
1022
+ async execute(args) {
1023
+ const nut = await getNutJs4();
1024
+ const windows = await nut.getWindows();
1025
+ const lower = args.title.toLowerCase();
1026
+ let target = null;
1027
+ for (const w of windows) {
1028
+ const t = await w.title;
1029
+ if (t.toLowerCase() === lower) {
1030
+ target = w;
1031
+ break;
1032
+ }
1033
+ if (!target && t.toLowerCase().includes(lower)) {
1034
+ target = w;
1035
+ }
1036
+ }
1037
+ if (!target) {
1038
+ return {
1039
+ type: "text",
1040
+ content: `Error: No window found matching "${args.title}"`
1041
+ };
1042
+ }
1043
+ const title = await target.title;
1044
+ await target.focus();
1045
+ return { type: "text", content: `Focused window: "${title}"` };
1046
+ }
1047
+ };
1048
+ var windowScreenshotTool = {
1049
+ name: "window_screenshot",
1050
+ description: "Capture a screenshot of a specific window by its title. The coordinate grid shows screen-absolute coordinates (matching mouse_click). Returns a screenshot ID.",
1051
+ parameters: z6.object({
1052
+ title: z6.string().describe("Window title to search for (partial match)")
1053
+ }),
1054
+ async execute(args, ctx) {
1055
+ const { Window, Monitor } = await getNodeScreenshots();
1056
+ const windows = Window.all().filter(
1057
+ (w) => w.title().trim().length > 0
1058
+ );
1059
+ const target = findWindowByTitle(windows, args.title);
1060
+ if (!target) {
1061
+ return {
1062
+ type: "text",
1063
+ content: `Error: No window found matching "${args.title}"`
1064
+ };
1065
+ }
1066
+ const winTitle = target.title();
1067
+ const winX = target.x();
1068
+ const winY = target.y();
1069
+ const image = target.captureImageSync();
1070
+ const physW = image.width;
1071
+ const physH = image.height;
1072
+ const monitor = target.currentMonitor();
1073
+ const scaleFactor = monitor ? monitor.scaleFactor() : 1;
1074
+ const logicalW = Math.round(physW / scaleFactor);
1075
+ const logicalH = Math.round(physH / scaleFactor);
1076
+ const raw = image.toRawSync();
1077
+ const resized = await sharp3(raw, {
1078
+ raw: { width: physW, height: physH, channels: 4 }
1079
+ }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
1080
+ const cleanBase64 = resized.toString("base64");
1081
+ const id = ctx.screenshots.save(
1082
+ cleanBase64,
1083
+ "image/jpeg",
1084
+ `window: ${winTitle}`
1085
+ );
1086
+ const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH, {
1087
+ offsetX: winX,
1088
+ offsetY: winY
1089
+ });
1090
+ const gridBase64 = gridImage.toString("base64");
1091
+ return {
1092
+ type: "image",
1093
+ base64: gridBase64,
1094
+ mimeType: "image/jpeg",
1095
+ screenshotId: id,
1096
+ content: gridRef
1097
+ };
1098
+ }
1099
+ };
1100
+
1101
+ // src/tools/windows/command.ts
1102
+ import { z as z7 } from "zod";
900
1103
  import { exec } from "child_process";
901
1104
  var MAX_OUTPUT_LENGTH = 1e4;
902
1105
  var runCommandTool = {
903
1106
  name: "run_command",
904
1107
  description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
905
- parameters: z5.object({
906
- command: z5.string().describe("The command to execute"),
907
- timeout: z5.number().positive().default(3e4).describe("Timeout in milliseconds")
1108
+ parameters: z7.object({
1109
+ command: z7.string().describe("The command to execute"),
1110
+ timeout: z7.number().positive().default(3e4).describe("Timeout in milliseconds")
908
1111
  }),
909
1112
  async execute(args) {
910
1113
  return new Promise((resolve) => {
@@ -935,14 +1138,14 @@ var runCommandTool = {
935
1138
  };
936
1139
 
937
1140
  // src/tools/file/read.ts
938
- import { z as z6 } from "zod";
1141
+ import { z as z8 } from "zod";
939
1142
  import { readFile } from "fs/promises";
940
1143
  var MAX_FILE_SIZE = 1e5;
941
1144
  var fileReadTool = {
942
1145
  name: "file_read",
943
1146
  description: "Read the contents of a file at the given path.",
944
- parameters: z6.object({
945
- path: z6.string().describe("Absolute path to the file")
1147
+ parameters: z8.object({
1148
+ path: z8.string().describe("Absolute path to the file")
946
1149
  }),
947
1150
  async execute(args) {
948
1151
  try {
@@ -962,15 +1165,15 @@ var fileReadTool = {
962
1165
  };
963
1166
 
964
1167
  // src/tools/file/write.ts
965
- import { z as z7 } from "zod";
1168
+ import { z as z9 } from "zod";
966
1169
  import { writeFile, mkdir } from "fs/promises";
967
1170
  import { dirname } from "path";
968
1171
  var fileWriteTool = {
969
1172
  name: "file_write",
970
1173
  description: "Write content to a file at the given path. Creates parent directories if needed.",
971
- parameters: z7.object({
972
- path: z7.string().describe("Absolute path to the file"),
973
- content: z7.string().describe("Content to write")
1174
+ parameters: z9.object({
1175
+ path: z9.string().describe("Absolute path to the file"),
1176
+ content: z9.string().describe("Content to write")
974
1177
  }),
975
1178
  async execute(args) {
976
1179
  try {
@@ -985,16 +1188,16 @@ var fileWriteTool = {
985
1188
  };
986
1189
 
987
1190
  // src/tools/file/image.ts
988
- import { z as z8 } from "zod";
1191
+ import { z as z10 } from "zod";
989
1192
  import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
990
1193
  import { extname } from "path";
991
1194
  var IMAGE_EXTS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".bmp", ".webp"]);
992
1195
  var useLocalImageTool = {
993
1196
  name: "use_local_image",
994
1197
  description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
995
- parameters: z8.object({
996
- path: z8.string().describe("Absolute path to the image file"),
997
- label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
1198
+ parameters: z10.object({
1199
+ path: z10.string().describe("Absolute path to the image file"),
1200
+ label: z10.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
998
1201
  }),
999
1202
  async execute(args, ctx) {
1000
1203
  if (!existsSync3(args.path)) {
@@ -1018,12 +1221,12 @@ var useLocalImageTool = {
1018
1221
  };
1019
1222
 
1020
1223
  // src/tools/browser/navigate.ts
1021
- import { z as z9 } from "zod";
1224
+ import { z as z11 } from "zod";
1022
1225
  var browserNavigateTool = {
1023
1226
  name: "browser_navigate",
1024
1227
  description: "Navigate the browser to a URL.",
1025
- parameters: z9.object({
1026
- url: z9.string().describe("The URL to navigate to")
1228
+ parameters: z11.object({
1229
+ url: z11.string().describe("The URL to navigate to")
1027
1230
  }),
1028
1231
  async execute(args, ctx) {
1029
1232
  const browser = await ctx.getBrowser();
@@ -1036,12 +1239,12 @@ Page title: ${title}` };
1036
1239
  };
1037
1240
 
1038
1241
  // src/tools/browser/click.ts
1039
- import { z as z10 } from "zod";
1242
+ import { z as z12 } from "zod";
1040
1243
  var browserClickTool = {
1041
1244
  name: "browser_click",
1042
1245
  description: "Click an element on the web page using a CSS selector or text content.",
1043
- parameters: z10.object({
1044
- selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
1246
+ parameters: z12.object({
1247
+ selector: z12.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
1045
1248
  }),
1046
1249
  async execute(args, ctx) {
1047
1250
  const browser = await ctx.getBrowser();
@@ -1052,14 +1255,14 @@ var browserClickTool = {
1052
1255
  };
1053
1256
 
1054
1257
  // src/tools/browser/type.ts
1055
- import { z as z11 } from "zod";
1258
+ import { z as z13 } from "zod";
1056
1259
  var browserTypeTool = {
1057
1260
  name: "browser_type",
1058
1261
  description: "Type text into an input field on the web page.",
1059
- parameters: z11.object({
1060
- selector: z11.string().describe("CSS selector for the input element"),
1061
- text: z11.string().describe("Text to type"),
1062
- clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
1262
+ parameters: z13.object({
1263
+ selector: z13.string().describe("CSS selector for the input element"),
1264
+ text: z13.string().describe("Text to type"),
1265
+ clear: z13.boolean().default(true).describe("Whether to clear the field before typing")
1063
1266
  }),
1064
1267
  async execute(args, ctx) {
1065
1268
  const browser = await ctx.getBrowser();
@@ -1074,12 +1277,12 @@ var browserTypeTool = {
1074
1277
  };
1075
1278
 
1076
1279
  // src/tools/browser/screenshot.ts
1077
- import { z as z12 } from "zod";
1280
+ import { z as z14 } from "zod";
1078
1281
  var browserScreenshotTool = {
1079
1282
  name: "browser_screenshot",
1080
1283
  description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
1081
- parameters: z12.object({
1082
- fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
1284
+ parameters: z14.object({
1285
+ fullPage: z14.boolean().default(false).describe("Whether to capture the full scrollable page")
1083
1286
  }),
1084
1287
  async execute(args, ctx) {
1085
1288
  const browser = await ctx.getBrowser();
@@ -1102,12 +1305,12 @@ var browserScreenshotTool = {
1102
1305
  };
1103
1306
 
1104
1307
  // src/tools/browser/content.ts
1105
- import { z as z13 } from "zod";
1308
+ import { z as z15 } from "zod";
1106
1309
  var MAX_CONTENT_LENGTH = 2e4;
1107
1310
  var browserContentTool = {
1108
1311
  name: "browser_content",
1109
1312
  description: "Get the text content of the current web page. Returns visible text, not HTML.",
1110
- parameters: z13.object({}),
1313
+ parameters: z15.object({}),
1111
1314
  async execute(_args, ctx) {
1112
1315
  const browser = await ctx.getBrowser();
1113
1316
  const page = await browser.getPage();
@@ -1128,13 +1331,13 @@ ${text}`
1128
1331
  };
1129
1332
 
1130
1333
  // src/tools/browser/scroll.ts
1131
- import { z as z14 } from "zod";
1334
+ import { z as z16 } from "zod";
1132
1335
  var browserScrollTool = {
1133
1336
  name: "browser_scroll",
1134
1337
  description: "Scroll the current web page.",
1135
- parameters: z14.object({
1136
- direction: z14.enum(["up", "down"]).describe("Scroll direction"),
1137
- amount: z14.number().positive().default(500).describe("Pixels to scroll")
1338
+ parameters: z16.object({
1339
+ direction: z16.enum(["up", "down"]).describe("Scroll direction"),
1340
+ amount: z16.number().positive().default(500).describe("Pixels to scroll")
1138
1341
  }),
1139
1342
  async execute(args, ctx) {
1140
1343
  const browser = await ctx.getBrowser();
@@ -1146,16 +1349,16 @@ var browserScrollTool = {
1146
1349
  };
1147
1350
 
1148
1351
  // src/tools/control/report.ts
1149
- import { z as z15 } from "zod";
1352
+ import { z as z17 } from "zod";
1150
1353
  var reportTool = {
1151
1354
  name: "report",
1152
1355
  description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
1153
- parameters: z15.object({
1154
- status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
1356
+ parameters: z17.object({
1357
+ status: z17.enum(["completed", "blocked", "need_guidance"]).describe(
1155
1358
  '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
1156
1359
  ),
1157
- content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
1158
- data: z15.unknown().optional().describe("Optional structured data to return")
1360
+ content: z17.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
1361
+ data: z17.unknown().optional().describe("Optional structured data to return")
1159
1362
  }),
1160
1363
  async execute(args) {
1161
1364
  return {
@@ -1176,6 +1379,11 @@ function createToolRegistry() {
1176
1379
  registry.register(mouseScrollTool);
1177
1380
  registry.register(keyboardTypeTool);
1178
1381
  registry.register(keyboardPressTool);
1382
+ registry.register(clipboardTypeTool);
1383
+ registry.register(switchInputMethodTool);
1384
+ registry.register(listWindowsTool);
1385
+ registry.register(focusWindowTool);
1386
+ registry.register(windowScreenshotTool);
1179
1387
  registry.register(runCommandTool);
1180
1388
  registry.register(fileReadTool);
1181
1389
  registry.register(fileWriteTool);