windows-use 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -3
- package/dist/cli.js +286 -63
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +260 -52
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +280 -72
- package/dist/mcp/server.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -97,15 +97,18 @@ function buildSystemPrompt() {
|
|
|
97
97
|
4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
|
|
98
98
|
|
|
99
99
|
## Reading Screenshots
|
|
100
|
-
-
|
|
101
|
-
-
|
|
102
|
-
-
|
|
100
|
+
- Screenshots include a **coordinate grid overlay** with **numbered blue reference markers** at grid intersections.
|
|
101
|
+
- Each screenshot also returns a **text coordinate table** mapping marker numbers to exact screen coordinates, e.g. \`[1](200,200) [2](400,200)\`.
|
|
102
|
+
- **How to locate elements precisely**: Find the nearest blue numbered marker to your target in the image, look up its exact (x,y) from the coordinate table, then adjust for the offset.
|
|
103
|
+
- Example: A button is just right of marker \`[7]\`. The table says \`[7](600,400)\`. The button is ~50px right \u2192 click at (650, 400).
|
|
104
|
+
- The red edge labels and bottom-right dimension label are also available for reference.
|
|
103
105
|
|
|
104
106
|
## Tool Selection
|
|
105
107
|
- **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
|
|
106
108
|
- **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
|
|
107
109
|
- **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
|
|
108
110
|
- **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
|
|
111
|
+
- **Window management**: Use \`list_windows\` to see all open windows, \`focus_window\` to activate a specific window, and \`window_screenshot\` to capture a specific window (coordinates in the grid are screen-absolute, matching \`mouse_click\`). Focus a window before sending keyboard/mouse input to it.
|
|
109
112
|
|
|
110
113
|
## Smart Screenshot Strategy
|
|
111
114
|
- ALWAYS take a screenshot before your first action.
|
|
@@ -124,7 +127,12 @@ function buildSystemPrompt() {
|
|
|
124
127
|
- **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
|
|
125
128
|
- **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
|
|
126
129
|
- **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
|
|
127
|
-
- **Text input**:
|
|
130
|
+
- **Text input**:
|
|
131
|
+
- For browser forms, prefer \`browser_type\` with the CSS selector.
|
|
132
|
+
- For desktop apps, click the input field first, then type.
|
|
133
|
+
- Use \`clipboard_type\` (paste via clipboard) when: the text contains non-ASCII characters (Chinese, Japanese, etc.), the current IME might interfere, or you need fast input.
|
|
134
|
+
- Use \`keyboard_type\` (character-by-character) when: you need to trigger per-key events, or for simple ASCII text with English IME active.
|
|
135
|
+
- If \`keyboard_type\` produces garbled text, switch to \`clipboard_type\` or use \`switch_input_method\` to toggle the IME first.
|
|
128
136
|
- **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
|
|
129
137
|
|
|
130
138
|
## Error Recovery
|
|
@@ -300,11 +308,13 @@ var AgentRunner = class {
|
|
|
300
308
|
}
|
|
301
309
|
if (result.type === "image") {
|
|
302
310
|
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
|
|
311
|
+
const textPart = result.content ? `Screenshot captured. ID: ${result.screenshotId}
|
|
312
|
+
${result.content}` : `Screenshot captured. ID: ${result.screenshotId}`;
|
|
303
313
|
this.contextManager.append({
|
|
304
314
|
role: "tool",
|
|
305
315
|
tool_call_id: toolCall.id,
|
|
306
316
|
content: [
|
|
307
|
-
{ type: "text", text:
|
|
317
|
+
{ type: "text", text: textPart },
|
|
308
318
|
{
|
|
309
319
|
type: "image_url",
|
|
310
320
|
image_url: {
|
|
@@ -670,6 +680,8 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
670
680
|
const gridSpacing = options.gridSpacing ?? 100;
|
|
671
681
|
const labelSpacing = options.labelSpacing ?? 200;
|
|
672
682
|
const majorSpacing = gridSpacing * 5;
|
|
683
|
+
const offsetX = options.offsetX ?? 0;
|
|
684
|
+
const offsetY = options.offsetY ?? 0;
|
|
673
685
|
const svgParts = [];
|
|
674
686
|
for (let x = gridSpacing; x < width; x += gridSpacing) {
|
|
675
687
|
const isMajor = x % majorSpacing === 0;
|
|
@@ -687,8 +699,24 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
687
699
|
`<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
|
|
688
700
|
);
|
|
689
701
|
}
|
|
702
|
+
const markers = [];
|
|
703
|
+
let markerId = 1;
|
|
704
|
+
for (let y = labelSpacing; y < height; y += labelSpacing) {
|
|
705
|
+
for (let x = labelSpacing; x < width; x += labelSpacing) {
|
|
706
|
+
const screenX = x + offsetX;
|
|
707
|
+
const screenY = y + offsetY;
|
|
708
|
+
markers.push({ id: markerId, screenX, screenY });
|
|
709
|
+
const label = String(markerId);
|
|
710
|
+
const r = label.length > 1 ? 12 : 10;
|
|
711
|
+
svgParts.push(
|
|
712
|
+
`<circle cx="${x}" cy="${y}" r="${r}" fill="rgba(0,110,255,0.85)" stroke="white" stroke-width="1"/>`,
|
|
713
|
+
`<text x="${x}" y="${y + 4}" text-anchor="middle" fill="white" font-size="${label.length > 1 ? 9 : 10}" font-family="Consolas,monospace" font-weight="bold">${label}</text>`
|
|
714
|
+
);
|
|
715
|
+
markerId++;
|
|
716
|
+
}
|
|
717
|
+
}
|
|
690
718
|
for (let x = labelSpacing; x < width; x += labelSpacing) {
|
|
691
|
-
const text = String(x);
|
|
719
|
+
const text = String(x + offsetX);
|
|
692
720
|
const tw = text.length * 7.5 + 6;
|
|
693
721
|
svgParts.push(
|
|
694
722
|
`<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
@@ -696,16 +724,18 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
696
724
|
);
|
|
697
725
|
}
|
|
698
726
|
for (let y = labelSpacing; y < height; y += labelSpacing) {
|
|
699
|
-
const text = String(y);
|
|
727
|
+
const text = String(y + offsetY);
|
|
700
728
|
const tw = text.length * 7.5 + 6;
|
|
701
729
|
svgParts.push(
|
|
702
730
|
`<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
703
731
|
`<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
|
|
704
732
|
);
|
|
705
733
|
}
|
|
734
|
+
const originText = `${offsetX},${offsetY}`;
|
|
735
|
+
const originTw = originText.length * 7.5 + 6;
|
|
706
736
|
svgParts.push(
|
|
707
|
-
`<rect x="2" y="2" width="
|
|
708
|
-
`<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold"
|
|
737
|
+
`<rect x="2" y="2" width="${originTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
738
|
+
`<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${originText}</text>`
|
|
709
739
|
);
|
|
710
740
|
const dimText = `${width}x${height}`;
|
|
711
741
|
const dimTw = dimText.length * 7.5 + 6;
|
|
@@ -716,7 +746,16 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
716
746
|
const svg = Buffer.from(
|
|
717
747
|
`<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
|
|
718
748
|
);
|
|
719
|
-
|
|
749
|
+
const image = await sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
|
|
750
|
+
const cols = Math.floor((width - 1) / labelSpacing);
|
|
751
|
+
const rows = [];
|
|
752
|
+
for (let i = 0; i < markers.length; i += cols) {
|
|
753
|
+
const row = markers.slice(i, i + cols).map((m) => `[${m.id}](${m.screenX},${m.screenY})`).join(" ");
|
|
754
|
+
rows.push(row);
|
|
755
|
+
}
|
|
756
|
+
const gridRef = `Grid reference points (marker \u2192 screen coordinates for mouse_click):
|
|
757
|
+
${rows.join("\n")}`;
|
|
758
|
+
return { image, gridRef };
|
|
720
759
|
}
|
|
721
760
|
|
|
722
761
|
// src/tools/windows/screenshot.ts
|
|
@@ -743,13 +782,14 @@ var screenshotTool = {
|
|
|
743
782
|
}).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
|
|
744
783
|
const cleanBase64 = resized.toString("base64");
|
|
745
784
|
const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
|
|
746
|
-
const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
|
|
785
|
+
const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH);
|
|
747
786
|
const gridBase64 = gridImage.toString("base64");
|
|
748
787
|
return {
|
|
749
788
|
type: "image",
|
|
750
789
|
base64: gridBase64,
|
|
751
790
|
mimeType: "image/jpeg",
|
|
752
|
-
screenshotId: id
|
|
791
|
+
screenshotId: id,
|
|
792
|
+
content: gridRef
|
|
753
793
|
};
|
|
754
794
|
}
|
|
755
795
|
};
|
|
@@ -895,16 +935,179 @@ var keyboardPressTool = {
|
|
|
895
935
|
}
|
|
896
936
|
};
|
|
897
937
|
|
|
898
|
-
// src/tools/windows/
|
|
938
|
+
// src/tools/windows/clipboard.ts
|
|
899
939
|
import { z as z5 } from "zod";
|
|
940
|
+
/**
 * Load the nut-js automation library on demand.
 *
 * The module is pulled in with a dynamic import, so it is only resolved
 * when a tool actually calls this helper.
 *
 * @returns {Promise<object>} The "@nut-tree-fork/nut-js" module namespace.
 */
async function getNutJs3() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
|
|
943
|
+
/**
 * Tool `clipboard_type`: enters text by placing it on the clipboard and
 * sending Ctrl+V, instead of typing it key by key.
 */
var clipboardTypeTool = {
  name: "clipboard_type",
  description: "Type text by copying it to the clipboard and pasting (Ctrl+V). This bypasses input method (IME) issues and is faster than keyboard_type. Use this when the current IME might interfere, or for non-ASCII text (Chinese, Japanese, etc.).",
  parameters: z5.object({
    text: z5.string().describe("The text to paste")
  }),
  /**
   * @param {{ text: string }} args - Validated tool arguments.
   * @returns {Promise<{ type: "text", content: string }>} Confirmation echoing the pasted text.
   */
  async execute(args) {
    const nut = await getNutJs3();
    // This replaces whatever was on the clipboard; the previous contents are not restored.
    await nut.clipboard.setContent(args.text);
    try {
      await nut.keyboard.pressKey(nut.Key.LeftControl, nut.Key.V);
    } finally {
      // Release even when the press fails part-way, so Ctrl can never be
      // left held down and corrupt every later keyboard action.
      await nut.keyboard.releaseKey(nut.Key.LeftControl, nut.Key.V);
    }
    return { type: "text", content: `Pasted: "${args.text}"` };
  }
};
|
|
957
|
+
/**
 * Tool `switch_input_method`: sends the Win+Space chord to toggle the
 * active input method. Takes no arguments.
 */
var switchInputMethodTool = {
  name: "switch_input_method",
  description: "Toggle the input method (IME) by pressing Win+Space. Use this before keyboard_type if the current IME is wrong. Take a screenshot afterward to verify the switch.",
  parameters: z5.object({}),
  /**
   * @returns {Promise<{ type: "text", content: string }>} Fixed confirmation message;
   *   the tool does not itself verify which input method ended up active.
   */
  async execute() {
    const nut = await getNutJs3();
    try {
      await nut.keyboard.pressKey(nut.Key.LeftWin, nut.Key.Space);
    } finally {
      // Release unconditionally: a Win key left held down would turn every
      // subsequent keystroke into a system shortcut.
      await nut.keyboard.releaseKey(nut.Key.LeftWin, nut.Key.Space);
    }
    return { type: "text", content: "Toggled input method (Win+Space)" };
  }
};
|
|
968
|
+
|
|
969
|
+
// src/tools/windows/window.ts
|
|
970
|
+
import { z as z6 } from "zod";
|
|
971
|
+
import sharp3 from "sharp";
|
|
972
|
+
/**
 * Load the nut-js automation library on demand.
 *
 * Uses a dynamic import, so the module is resolved only when this helper
 * is called.
 *
 * @returns {Promise<object>} The "@nut-tree-fork/nut-js" module namespace.
 */
async function getNutJs4() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
|
|
975
|
+
/**
 * Load the node-screenshots library on demand.
 *
 * Uses a dynamic import, so the module is resolved only when this helper
 * is called.
 *
 * @returns {Promise<object>} The "node-screenshots" module namespace.
 */
async function getNodeScreenshots() {
  const screenshotsModule = await import("node-screenshots");
  return screenshotsModule;
}
|
|
978
|
+
/**
 * Find a window by title, ignoring case.
 *
 * An exact title match wins over a substring match; within each category
 * the first window in input order is chosen.
 *
 * @param {Array<{ title(): string }>} windows - Candidate windows exposing a `title()` method.
 * @param {string} title - Title (or fragment of one) to search for.
 * @returns {object|undefined} The matching window, or `undefined` when nothing
 *   matches or the query is empty / whitespace-only.
 */
function findWindowByTitle(windows, title) {
  const needle = title.toLowerCase();
  // Every string "includes" the empty string, so a blank query would
  // otherwise select an arbitrary first window. Treat it as no match.
  if (needle.trim().length === 0) return void 0;
  let firstPartial;
  for (const candidate of windows) {
    const candidateTitle = candidate.title().toLowerCase();
    if (candidateTitle === needle) return candidate;
    if (firstPartial === void 0 && candidateTitle.includes(needle)) {
      // Remember it, but keep scanning in case an exact match follows.
      firstPartial = candidate;
    }
  }
  return firstPartial;
}
|
|
988
|
+
/**
 * Tool `list_windows`: reports every window returned by node-screenshots'
 * `Window.all()` that has a non-blank title, one line per window.
 */
var listWindowsTool = {
  name: "list_windows",
  description: "List all visible windows with their titles, positions, and sizes.",
  parameters: z6.object({}),
  /**
   * @returns {Promise<{ type: "text", content: string }>} A count header followed by
   *   one line per window; the focused window is flagged with `*` and
   *   minimized windows carry a ` [minimized]` suffix.
   */
  async execute() {
    const screenshots = await getNodeScreenshots();
    const titled = screenshots.Window.all().filter((win) => win.title().trim().length > 0);
    const lines = [];
    for (const win of titled) {
      const info = {
        id: win.id(),
        title: win.title(),
        appName: win.appName(),
        x: win.x(),
        y: win.y(),
        width: win.width(),
        height: win.height(),
        isMinimized: win.isMinimized(),
        isFocused: win.isFocused()
      };
      const focusMark = info.isFocused ? "*" : " ";
      const minimizedTag = info.isMinimized ? " [minimized]" : "";
      lines.push(
        `[${focusMark}] "${info.title}" (${info.appName}) \u2014 pos:(${info.x},${info.y}) size:${info.width}x${info.height}${minimizedTag}`
      );
    }
    return {
      type: "text",
      content: `Found ${lines.length} windows:\n${lines.join("\n")}`
    };
  }
};
|
|
1016
|
+
/**
 * Tool `focus_window`: activates the window whose title matches the query.
 *
 * Matching is case-insensitive. An exact title match is preferred; otherwise
 * the first window whose title contains the query is used.
 */
var focusWindowTool = {
  name: "focus_window",
  description: "Focus (activate) a window by its title. Uses partial, case-insensitive matching.",
  parameters: z6.object({
    title: z6.string().describe("Window title to search for (partial match)")
  }),
  /**
   * @param {{ title: string }} args - Validated tool arguments.
   * @returns {Promise<{ type: "text", content: string }>} Confirmation naming the
   *   focused window, or an `Error: ...` message when the query is blank or
   *   nothing matches.
   */
  async execute(args) {
    const needle = args.title.toLowerCase();
    // Every title "includes" the empty string, so a blank query would focus
    // whichever window happens to be enumerated first. Reject it up front.
    if (needle.trim().length === 0) {
      return {
        type: "text",
        content: "Error: Window title must not be empty"
      };
    }
    const nut = await getNutJs4();
    const windows = await nut.getWindows();
    let match = null;
    for (const candidate of windows) {
      // nut-js exposes the title asynchronously, hence the await per window.
      const candidateTitle = await candidate.title;
      const lowered = candidateTitle.toLowerCase();
      if (lowered === needle) {
        match = { window: candidate, title: candidateTitle };
        break;
      }
      if (!match && lowered.includes(needle)) {
        // Keep the first partial hit, but keep scanning for an exact one.
        match = { window: candidate, title: candidateTitle };
      }
    }
    if (!match) {
      return {
        type: "text",
        content: `Error: No window found matching "${args.title}"`
      };
    }
    await match.window.focus();
    return { type: "text", content: `Focused window: "${match.title}"` };
  }
};
|
|
1048
|
+
/**
 * Tool `window_screenshot`: captures a single window, stores the clean
 * image, and returns a copy with the coordinate grid drawn on top.
 *
 * The window's position is passed to the grid as an offset, so the grid
 * labels and the returned marker table are shifted by the window origin.
 */
var windowScreenshotTool = {
  name: "window_screenshot",
  description: "Capture a screenshot of a specific window by its title. The coordinate grid shows screen-absolute coordinates (matching mouse_click). Returns a screenshot ID.",
  parameters: z6.object({
    title: z6.string().describe("Window title to search for (partial match)")
  }),
  /**
   * @param {{ title: string }} args - Validated tool arguments.
   * @param {object} ctx - Tool context; `ctx.screenshots.save` stores the clean image and returns its ID.
   * @returns {Promise<object>} An image result (`base64`, `mimeType`, `screenshotId`,
   *   `content` = grid reference table), or a text `Error: ...` result when no
   *   window matches.
   */
  async execute(args, ctx) {
    const { Window } = await getNodeScreenshots();
    const windows = Window.all().filter(
      (w) => w.title().trim().length > 0
    );
    const target = findWindowByTitle(windows, args.title);
    if (!target) {
      return {
        type: "text",
        content: `Error: No window found matching "${args.title}"`
      };
    }
    const winTitle = target.title();
    // NOTE(review): these are used as grid offsets without scaling, while the
    // image below is divided by the monitor scale factor — presumably x()/y()
    // are already in logical units; confirm against node-screenshots.
    const winX = target.x();
    const winY = target.y();
    const image = target.captureImageSync();
    const physW = image.width;
    const physH = image.height;
    const monitor = target.currentMonitor();
    const reportedScale = monitor ? monitor.scaleFactor() : 1;
    // Fall back to 1 if the monitor reports a non-positive or NaN factor,
    // which would otherwise yield Infinity/NaN dimensions.
    const scaleFactor = reportedScale > 0 ? reportedScale : 1;
    // Clamp to 1px: rounding a very small window at a high scale factor can
    // give 0, which sharp's resize rejects.
    const logicalW = Math.max(1, Math.round(physW / scaleFactor));
    const logicalH = Math.max(1, Math.round(physH / scaleFactor));
    const raw = image.toRawSync();
    const resized = await sharp3(raw, {
      raw: { width: physW, height: physH, channels: 4 }
    }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
    // The stored screenshot is the clean image; only the returned copy gets the grid.
    const cleanBase64 = resized.toString("base64");
    const id = ctx.screenshots.save(
      cleanBase64,
      "image/jpeg",
      `window: ${winTitle}`
    );
    const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH, {
      offsetX: winX,
      offsetY: winY
    });
    const gridBase64 = gridImage.toString("base64");
    return {
      type: "image",
      base64: gridBase64,
      mimeType: "image/jpeg",
      screenshotId: id,
      content: gridRef
    };
  }
};
|
|
1100
|
+
|
|
1101
|
+
// src/tools/windows/command.ts
|
|
1102
|
+
import { z as z7 } from "zod";
|
|
900
1103
|
import { exec } from "child_process";
|
|
901
1104
|
var MAX_OUTPUT_LENGTH = 1e4;
|
|
902
1105
|
var runCommandTool = {
|
|
903
1106
|
name: "run_command",
|
|
904
1107
|
description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
|
|
905
|
-
parameters:
|
|
906
|
-
command:
|
|
907
|
-
timeout:
|
|
1108
|
+
parameters: z7.object({
|
|
1109
|
+
command: z7.string().describe("The command to execute"),
|
|
1110
|
+
timeout: z7.number().positive().default(3e4).describe("Timeout in milliseconds")
|
|
908
1111
|
}),
|
|
909
1112
|
async execute(args) {
|
|
910
1113
|
return new Promise((resolve) => {
|
|
@@ -935,14 +1138,14 @@ var runCommandTool = {
|
|
|
935
1138
|
};
|
|
936
1139
|
|
|
937
1140
|
// src/tools/file/read.ts
|
|
938
|
-
import { z as
|
|
1141
|
+
import { z as z8 } from "zod";
|
|
939
1142
|
import { readFile } from "fs/promises";
|
|
940
1143
|
var MAX_FILE_SIZE = 1e5;
|
|
941
1144
|
var fileReadTool = {
|
|
942
1145
|
name: "file_read",
|
|
943
1146
|
description: "Read the contents of a file at the given path.",
|
|
944
|
-
parameters:
|
|
945
|
-
path:
|
|
1147
|
+
parameters: z8.object({
|
|
1148
|
+
path: z8.string().describe("Absolute path to the file")
|
|
946
1149
|
}),
|
|
947
1150
|
async execute(args) {
|
|
948
1151
|
try {
|
|
@@ -962,15 +1165,15 @@ var fileReadTool = {
|
|
|
962
1165
|
};
|
|
963
1166
|
|
|
964
1167
|
// src/tools/file/write.ts
|
|
965
|
-
import { z as
|
|
1168
|
+
import { z as z9 } from "zod";
|
|
966
1169
|
import { writeFile, mkdir } from "fs/promises";
|
|
967
1170
|
import { dirname } from "path";
|
|
968
1171
|
var fileWriteTool = {
|
|
969
1172
|
name: "file_write",
|
|
970
1173
|
description: "Write content to a file at the given path. Creates parent directories if needed.",
|
|
971
|
-
parameters:
|
|
972
|
-
path:
|
|
973
|
-
content:
|
|
1174
|
+
parameters: z9.object({
|
|
1175
|
+
path: z9.string().describe("Absolute path to the file"),
|
|
1176
|
+
content: z9.string().describe("Content to write")
|
|
974
1177
|
}),
|
|
975
1178
|
async execute(args) {
|
|
976
1179
|
try {
|
|
@@ -985,16 +1188,16 @@ var fileWriteTool = {
|
|
|
985
1188
|
};
|
|
986
1189
|
|
|
987
1190
|
// src/tools/file/image.ts
|
|
988
|
-
import { z as
|
|
1191
|
+
import { z as z10 } from "zod";
|
|
989
1192
|
import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
|
|
990
1193
|
import { extname } from "path";
|
|
991
1194
|
var IMAGE_EXTS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".bmp", ".webp"]);
|
|
992
1195
|
var useLocalImageTool = {
|
|
993
1196
|
name: "use_local_image",
|
|
994
1197
|
description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
|
|
995
|
-
parameters:
|
|
996
|
-
path:
|
|
997
|
-
label:
|
|
1198
|
+
parameters: z10.object({
|
|
1199
|
+
path: z10.string().describe("Absolute path to the image file"),
|
|
1200
|
+
label: z10.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
|
|
998
1201
|
}),
|
|
999
1202
|
async execute(args, ctx) {
|
|
1000
1203
|
if (!existsSync3(args.path)) {
|
|
@@ -1018,12 +1221,12 @@ var useLocalImageTool = {
|
|
|
1018
1221
|
};
|
|
1019
1222
|
|
|
1020
1223
|
// src/tools/browser/navigate.ts
|
|
1021
|
-
import { z as
|
|
1224
|
+
import { z as z11 } from "zod";
|
|
1022
1225
|
var browserNavigateTool = {
|
|
1023
1226
|
name: "browser_navigate",
|
|
1024
1227
|
description: "Navigate the browser to a URL.",
|
|
1025
|
-
parameters:
|
|
1026
|
-
url:
|
|
1228
|
+
parameters: z11.object({
|
|
1229
|
+
url: z11.string().describe("The URL to navigate to")
|
|
1027
1230
|
}),
|
|
1028
1231
|
async execute(args, ctx) {
|
|
1029
1232
|
const browser = await ctx.getBrowser();
|
|
@@ -1036,12 +1239,12 @@ Page title: ${title}` };
|
|
|
1036
1239
|
};
|
|
1037
1240
|
|
|
1038
1241
|
// src/tools/browser/click.ts
|
|
1039
|
-
import { z as
|
|
1242
|
+
import { z as z12 } from "zod";
|
|
1040
1243
|
var browserClickTool = {
|
|
1041
1244
|
name: "browser_click",
|
|
1042
1245
|
description: "Click an element on the web page using a CSS selector or text content.",
|
|
1043
|
-
parameters:
|
|
1044
|
-
selector:
|
|
1246
|
+
parameters: z12.object({
|
|
1247
|
+
selector: z12.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
|
|
1045
1248
|
}),
|
|
1046
1249
|
async execute(args, ctx) {
|
|
1047
1250
|
const browser = await ctx.getBrowser();
|
|
@@ -1052,14 +1255,14 @@ var browserClickTool = {
|
|
|
1052
1255
|
};
|
|
1053
1256
|
|
|
1054
1257
|
// src/tools/browser/type.ts
|
|
1055
|
-
import { z as
|
|
1258
|
+
import { z as z13 } from "zod";
|
|
1056
1259
|
var browserTypeTool = {
|
|
1057
1260
|
name: "browser_type",
|
|
1058
1261
|
description: "Type text into an input field on the web page.",
|
|
1059
|
-
parameters:
|
|
1060
|
-
selector:
|
|
1061
|
-
text:
|
|
1062
|
-
clear:
|
|
1262
|
+
parameters: z13.object({
|
|
1263
|
+
selector: z13.string().describe("CSS selector for the input element"),
|
|
1264
|
+
text: z13.string().describe("Text to type"),
|
|
1265
|
+
clear: z13.boolean().default(true).describe("Whether to clear the field before typing")
|
|
1063
1266
|
}),
|
|
1064
1267
|
async execute(args, ctx) {
|
|
1065
1268
|
const browser = await ctx.getBrowser();
|
|
@@ -1074,12 +1277,12 @@ var browserTypeTool = {
|
|
|
1074
1277
|
};
|
|
1075
1278
|
|
|
1076
1279
|
// src/tools/browser/screenshot.ts
|
|
1077
|
-
import { z as
|
|
1280
|
+
import { z as z14 } from "zod";
|
|
1078
1281
|
var browserScreenshotTool = {
|
|
1079
1282
|
name: "browser_screenshot",
|
|
1080
1283
|
description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
|
|
1081
|
-
parameters:
|
|
1082
|
-
fullPage:
|
|
1284
|
+
parameters: z14.object({
|
|
1285
|
+
fullPage: z14.boolean().default(false).describe("Whether to capture the full scrollable page")
|
|
1083
1286
|
}),
|
|
1084
1287
|
async execute(args, ctx) {
|
|
1085
1288
|
const browser = await ctx.getBrowser();
|
|
@@ -1102,12 +1305,12 @@ var browserScreenshotTool = {
|
|
|
1102
1305
|
};
|
|
1103
1306
|
|
|
1104
1307
|
// src/tools/browser/content.ts
|
|
1105
|
-
import { z as
|
|
1308
|
+
import { z as z15 } from "zod";
|
|
1106
1309
|
var MAX_CONTENT_LENGTH = 2e4;
|
|
1107
1310
|
var browserContentTool = {
|
|
1108
1311
|
name: "browser_content",
|
|
1109
1312
|
description: "Get the text content of the current web page. Returns visible text, not HTML.",
|
|
1110
|
-
parameters:
|
|
1313
|
+
parameters: z15.object({}),
|
|
1111
1314
|
async execute(_args, ctx) {
|
|
1112
1315
|
const browser = await ctx.getBrowser();
|
|
1113
1316
|
const page = await browser.getPage();
|
|
@@ -1128,13 +1331,13 @@ ${text}`
|
|
|
1128
1331
|
};
|
|
1129
1332
|
|
|
1130
1333
|
// src/tools/browser/scroll.ts
|
|
1131
|
-
import { z as
|
|
1334
|
+
import { z as z16 } from "zod";
|
|
1132
1335
|
var browserScrollTool = {
|
|
1133
1336
|
name: "browser_scroll",
|
|
1134
1337
|
description: "Scroll the current web page.",
|
|
1135
|
-
parameters:
|
|
1136
|
-
direction:
|
|
1137
|
-
amount:
|
|
1338
|
+
parameters: z16.object({
|
|
1339
|
+
direction: z16.enum(["up", "down"]).describe("Scroll direction"),
|
|
1340
|
+
amount: z16.number().positive().default(500).describe("Pixels to scroll")
|
|
1138
1341
|
}),
|
|
1139
1342
|
async execute(args, ctx) {
|
|
1140
1343
|
const browser = await ctx.getBrowser();
|
|
@@ -1146,16 +1349,16 @@ var browserScrollTool = {
|
|
|
1146
1349
|
};
|
|
1147
1350
|
|
|
1148
1351
|
// src/tools/control/report.ts
|
|
1149
|
-
import { z as
|
|
1352
|
+
import { z as z17 } from "zod";
|
|
1150
1353
|
var reportTool = {
|
|
1151
1354
|
name: "report",
|
|
1152
1355
|
description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
|
|
1153
|
-
parameters:
|
|
1154
|
-
status:
|
|
1356
|
+
parameters: z17.object({
|
|
1357
|
+
status: z17.enum(["completed", "blocked", "need_guidance"]).describe(
|
|
1155
1358
|
'"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
|
|
1156
1359
|
),
|
|
1157
|
-
content:
|
|
1158
|
-
data:
|
|
1360
|
+
content: z17.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
|
|
1361
|
+
data: z17.unknown().optional().describe("Optional structured data to return")
|
|
1159
1362
|
}),
|
|
1160
1363
|
async execute(args) {
|
|
1161
1364
|
return {
|
|
@@ -1176,6 +1379,11 @@ function createToolRegistry() {
|
|
|
1176
1379
|
registry.register(mouseScrollTool);
|
|
1177
1380
|
registry.register(keyboardTypeTool);
|
|
1178
1381
|
registry.register(keyboardPressTool);
|
|
1382
|
+
registry.register(clipboardTypeTool);
|
|
1383
|
+
registry.register(switchInputMethodTool);
|
|
1384
|
+
registry.register(listWindowsTool);
|
|
1385
|
+
registry.register(focusWindowTool);
|
|
1386
|
+
registry.register(windowScreenshotTool);
|
|
1179
1387
|
registry.register(runCommandTool);
|
|
1180
1388
|
registry.register(fileReadTool);
|
|
1181
1389
|
registry.register(fileWriteTool);
|