windows-use 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -3
- package/dist/cli.js +286 -63
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +1 -0
- package/dist/index.js +260 -52
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +280 -72
- package/dist/mcp/server.js.map +1 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -126,15 +126,18 @@ function buildSystemPrompt() {
|
|
|
126
126
|
4. When the task is done, you are blocked, or you need guidance, call \`report\` immediately.
|
|
127
127
|
|
|
128
128
|
## Reading Screenshots
|
|
129
|
-
-
|
|
130
|
-
-
|
|
131
|
-
-
|
|
129
|
+
- Screenshots include a **coordinate grid overlay** with **numbered blue reference markers** at grid intersections.
|
|
130
|
+
- Each screenshot also returns a **text coordinate table** mapping marker numbers to exact screen coordinates, e.g. \`[1](200,200) [2](400,200)\`.
|
|
131
|
+
- **How to locate elements precisely**: Find the nearest blue numbered marker to your target in the image, look up its exact (x,y) from the coordinate table, then adjust for the offset.
|
|
132
|
+
- Example: A button is just right of marker \`[7]\`. The table says \`[7](600,400)\`. The button is ~50px right \u2192 click at (650, 400).
|
|
133
|
+
- The red edge labels and bottom-right dimension label are also available for reference.
|
|
132
134
|
|
|
133
135
|
## Tool Selection
|
|
134
136
|
- **Browser tasks**: Prefer \`browser_*\` tools (they use CSS selectors, more reliable than coordinates). Use \`browser_content\` to find text/elements when you can't locate them visually.
|
|
135
137
|
- **Desktop/native app tasks**: Use \`screenshot\` + \`mouse_click\`/\`keyboard_*\`. Read coordinates from the grid overlay.
|
|
136
138
|
- **Terminal tasks**: Prefer \`run_command\` over GUI interactions. It's faster and more reliable.
|
|
137
139
|
- **Mixed tasks**: You can combine all tool types. For example, use \`run_command\` to launch an app, then \`screenshot\` + mouse to interact with it.
|
|
140
|
+
- **Window management**: Use \`list_windows\` to see all open windows, \`focus_window\` to activate a specific window, and \`window_screenshot\` to capture a specific window (coordinates in the grid are screen-absolute, matching \`mouse_click\`). Focus a window before sending keyboard/mouse input to it.
|
|
138
141
|
|
|
139
142
|
## Smart Screenshot Strategy
|
|
140
143
|
- ALWAYS take a screenshot before your first action.
|
|
@@ -153,7 +156,12 @@ function buildSystemPrompt() {
|
|
|
153
156
|
- **Popups/dialogs**: Handle unexpected dialogs (cookie banners, notifications, confirmations) by dismissing or accepting them, then continue with the original task.
|
|
154
157
|
- **Dropdowns/menus**: Click to open, then screenshot to see options before selecting.
|
|
155
158
|
- **Scrolling**: If content is below the fold, scroll down and screenshot. Check both browser_scroll (for web pages) and mouse_scroll (for desktop apps).
|
|
156
|
-
- **Text input**:
|
|
159
|
+
- **Text input**:
|
|
160
|
+
- For browser forms, prefer \`browser_type\` with the CSS selector.
|
|
161
|
+
- For desktop apps, click the input field first, then type.
|
|
162
|
+
- Use \`clipboard_type\` (paste via clipboard) when: the text contains non-ASCII characters (Chinese, Japanese, etc.), the current IME might interfere, or you need fast input.
|
|
163
|
+
- Use \`keyboard_type\` (character-by-character) when: you need to trigger per-key events, or for simple ASCII text with English IME active.
|
|
164
|
+
- If \`keyboard_type\` produces garbled text, switch to \`clipboard_type\` or use \`switch_input_method\` to toggle the IME first.
|
|
157
165
|
- **Coordinate precision**: When clicking small UI elements (buttons, links, checkboxes), aim for their center. If a click misses, adjust coordinates and try once more.
|
|
158
166
|
|
|
159
167
|
## Error Recovery
|
|
@@ -339,11 +347,13 @@ var init_runner = __esm({
|
|
|
339
347
|
}
|
|
340
348
|
if (result.type === "image") {
|
|
341
349
|
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
|
|
350
|
+
const textPart = result.content ? `Screenshot captured. ID: ${result.screenshotId}
|
|
351
|
+
${result.content}` : `Screenshot captured. ID: ${result.screenshotId}`;
|
|
342
352
|
this.contextManager.append({
|
|
343
353
|
role: "tool",
|
|
344
354
|
tool_call_id: toolCall.id,
|
|
345
355
|
content: [
|
|
346
|
-
{ type: "text", text:
|
|
356
|
+
{ type: "text", text: textPart },
|
|
347
357
|
{
|
|
348
358
|
type: "image_url",
|
|
349
359
|
image_url: {
|
|
@@ -725,6 +735,8 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
725
735
|
const gridSpacing = options.gridSpacing ?? 100;
|
|
726
736
|
const labelSpacing = options.labelSpacing ?? 200;
|
|
727
737
|
const majorSpacing = gridSpacing * 5;
|
|
738
|
+
const offsetX = options.offsetX ?? 0;
|
|
739
|
+
const offsetY = options.offsetY ?? 0;
|
|
728
740
|
const svgParts = [];
|
|
729
741
|
for (let x = gridSpacing; x < width; x += gridSpacing) {
|
|
730
742
|
const isMajor = x % majorSpacing === 0;
|
|
@@ -742,8 +754,24 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
742
754
|
`<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
|
|
743
755
|
);
|
|
744
756
|
}
|
|
757
|
+
const markers = [];
|
|
758
|
+
let markerId = 1;
|
|
759
|
+
for (let y = labelSpacing; y < height; y += labelSpacing) {
|
|
760
|
+
for (let x = labelSpacing; x < width; x += labelSpacing) {
|
|
761
|
+
const screenX = x + offsetX;
|
|
762
|
+
const screenY = y + offsetY;
|
|
763
|
+
markers.push({ id: markerId, screenX, screenY });
|
|
764
|
+
const label = String(markerId);
|
|
765
|
+
const r = label.length > 1 ? 12 : 10;
|
|
766
|
+
svgParts.push(
|
|
767
|
+
`<circle cx="${x}" cy="${y}" r="${r}" fill="rgba(0,110,255,0.85)" stroke="white" stroke-width="1"/>`,
|
|
768
|
+
`<text x="${x}" y="${y + 4}" text-anchor="middle" fill="white" font-size="${label.length > 1 ? 9 : 10}" font-family="Consolas,monospace" font-weight="bold">${label}</text>`
|
|
769
|
+
);
|
|
770
|
+
markerId++;
|
|
771
|
+
}
|
|
772
|
+
}
|
|
745
773
|
for (let x = labelSpacing; x < width; x += labelSpacing) {
|
|
746
|
-
const text = String(x);
|
|
774
|
+
const text = String(x + offsetX);
|
|
747
775
|
const tw = text.length * 7.5 + 6;
|
|
748
776
|
svgParts.push(
|
|
749
777
|
`<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
@@ -751,16 +779,18 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
751
779
|
);
|
|
752
780
|
}
|
|
753
781
|
for (let y = labelSpacing; y < height; y += labelSpacing) {
|
|
754
|
-
const text = String(y);
|
|
782
|
+
const text = String(y + offsetY);
|
|
755
783
|
const tw = text.length * 7.5 + 6;
|
|
756
784
|
svgParts.push(
|
|
757
785
|
`<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
758
786
|
`<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
|
|
759
787
|
);
|
|
760
788
|
}
|
|
789
|
+
const originText = `${offsetX},${offsetY}`;
|
|
790
|
+
const originTw = originText.length * 7.5 + 6;
|
|
761
791
|
svgParts.push(
|
|
762
|
-
`<rect x="2" y="2" width="
|
|
763
|
-
`<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold"
|
|
792
|
+
`<rect x="2" y="2" width="${originTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
793
|
+
`<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${originText}</text>`
|
|
764
794
|
);
|
|
765
795
|
const dimText = `${width}x${height}`;
|
|
766
796
|
const dimTw = dimText.length * 7.5 + 6;
|
|
@@ -771,7 +801,16 @@ async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
|
771
801
|
const svg = Buffer.from(
|
|
772
802
|
`<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
|
|
773
803
|
);
|
|
774
|
-
|
|
804
|
+
const image = await sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
|
|
805
|
+
const cols = Math.floor((width - 1) / labelSpacing);
|
|
806
|
+
const rows = [];
|
|
807
|
+
for (let i = 0; i < markers.length; i += cols) {
|
|
808
|
+
const row = markers.slice(i, i + cols).map((m) => `[${m.id}](${m.screenX},${m.screenY})`).join(" ");
|
|
809
|
+
rows.push(row);
|
|
810
|
+
}
|
|
811
|
+
const gridRef = `Grid reference points (marker \u2192 screen coordinates for mouse_click):
|
|
812
|
+
${rows.join("\n")}`;
|
|
813
|
+
return { image, gridRef };
|
|
775
814
|
}
|
|
776
815
|
var init_grid_overlay = __esm({
|
|
777
816
|
"src/tools/windows/grid-overlay.ts"() {
|
|
@@ -810,13 +849,14 @@ var init_screenshot = __esm({
|
|
|
810
849
|
}).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
|
|
811
850
|
const cleanBase64 = resized.toString("base64");
|
|
812
851
|
const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
|
|
813
|
-
const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
|
|
852
|
+
const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH);
|
|
814
853
|
const gridBase64 = gridImage.toString("base64");
|
|
815
854
|
return {
|
|
816
855
|
type: "image",
|
|
817
856
|
base64: gridBase64,
|
|
818
857
|
mimeType: "image/jpeg",
|
|
819
|
-
screenshotId: id
|
|
858
|
+
screenshotId: id,
|
|
859
|
+
content: gridRef
|
|
820
860
|
};
|
|
821
861
|
}
|
|
822
862
|
};
|
|
@@ -976,8 +1016,184 @@ var init_keyboard = __esm({
|
|
|
976
1016
|
}
|
|
977
1017
|
});
|
|
978
1018
|
|
|
979
|
-
// src/tools/windows/
|
|
1019
|
+
// src/tools/windows/clipboard.ts
|
|
980
1020
|
import { z as z5 } from "zod";
|
|
1021
|
+
/**
 * Loads the nut-js automation library on demand via a dynamic import,
 * so the module is only resolved when a tool actually needs it.
 *
 * @returns {Promise<object>} The module namespace of "@nut-tree-fork/nut-js".
 */
async function getNutJs3() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
|
|
1024
|
+
var clipboardTypeTool, switchInputMethodTool;
// Module initializer for src/tools/windows/clipboard.ts. Assigns the two tool
// definitions declared above when the initializer is invoked.
var init_clipboard = __esm({
  "src/tools/windows/clipboard.ts"() {
    "use strict";
    /**
     * Presses the given keys together and then releases them.
     * The release runs in `finally` so that a failure while pressing does not
     * leave a modifier (Ctrl / Win) held down for every later input action.
     *
     * @param {object} nut - The loaded nut-js module.
     * @param {...*} keys - Key constants (e.g. `nut.Key.LeftControl`), in press order.
     * @returns {Promise<void>}
     */
    const tapChord = async (nut, ...keys) => {
      try {
        await nut.keyboard.pressKey(...keys);
      } finally {
        await nut.keyboard.releaseKey(...keys);
      }
    };
    /**
     * Tool: put `args.text` on the clipboard and paste it with Ctrl+V.
     * Note that the previous clipboard content is overwritten and not restored.
     */
    clipboardTypeTool = {
      name: "clipboard_type",
      description: "Type text by copying it to the clipboard and pasting (Ctrl+V). This bypasses input method (IME) issues and is faster than keyboard_type. Use this when the current IME might interfere, or for non-ASCII text (Chinese, Japanese, etc.).",
      parameters: z5.object({
        text: z5.string().describe("The text to paste")
      }),
      /**
       * @param {{ text: string }} args - Validated tool arguments.
       * @returns {Promise<{ type: "text", content: string }>} Confirmation echoing the pasted text.
       */
      async execute(args) {
        const nut = await getNutJs3();
        await nut.clipboard.setContent(args.text);
        await tapChord(nut, nut.Key.LeftControl, nut.Key.V);
        return { type: "text", content: `Pasted: "${args.text}"` };
      }
    };
    /**
     * Tool: toggle the active input method by sending Win+Space.
     * The tool does not verify the resulting IME state; its description tells
     * the agent to take a screenshot afterwards.
     */
    switchInputMethodTool = {
      name: "switch_input_method",
      description: "Toggle the input method (IME) by pressing Win+Space. Use this before keyboard_type if the current IME is wrong. Take a screenshot afterward to verify the switch.",
      parameters: z5.object({}),
      /**
       * @returns {Promise<{ type: "text", content: string }>} Fixed confirmation message.
       */
      async execute() {
        const nut = await getNutJs3();
        await tapChord(nut, nut.Key.LeftWin, nut.Key.Space);
        return { type: "text", content: "Toggled input method (Win+Space)" };
      }
    };
  }
});
|
|
1055
|
+
|
|
1056
|
+
// src/tools/windows/window.ts
|
|
1057
|
+
import { z as z6 } from "zod";
|
|
1058
|
+
import sharp3 from "sharp";
|
|
1059
|
+
/**
 * Loads the nut-js automation library on demand via a dynamic import.
 *
 * @returns {Promise<object>} The module namespace of "@nut-tree-fork/nut-js".
 */
async function getNutJs4() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
|
|
1062
|
+
/**
 * Loads the "node-screenshots" capture library on demand via a dynamic import.
 *
 * @returns {Promise<object>} The module namespace of "node-screenshots".
 */
async function getNodeScreenshots() {
  const screenshotsModule = await import("node-screenshots");
  return screenshotsModule;
}
|
|
1065
|
+
/**
 * Finds a window by title, ignoring case.
 *
 * An exact (case-insensitive) title match anywhere in the list wins; otherwise
 * the first window whose title contains the query is returned.
 *
 * @param {Array<{ title(): string }>} windows - Candidates exposing a `title()` method.
 * @param {string} title - Title (or title fragment) to search for.
 * @returns {object|undefined} The matching window, or `undefined` when nothing
 *   matches or the query is empty/whitespace-only.
 */
function findWindowByTitle(windows, title) {
  const needle = title.toLowerCase();
  // An empty query would "match" every title via includes(""), silently
  // selecting an arbitrary window; treat it as no match instead.
  if (needle.trim().length === 0) return undefined;
  let firstPartial;
  for (const candidate of windows) {
    const candidateTitle = candidate.title().toLowerCase();
    if (candidateTitle === needle) return candidate;
    if (firstPartial === undefined && candidateTitle.includes(needle)) {
      firstPartial = candidate;
    }
  }
  return firstPartial;
}
|
|
1075
|
+
var listWindowsTool, focusWindowTool, windowScreenshotTool;
// Module initializer for src/tools/windows/window.ts. Assigns the three window
// management tools declared above when the initializer is invoked.
var init_window = __esm({
  "src/tools/windows/window.ts"() {
    "use strict";
    init_grid_overlay();
    /**
     * Tool: enumerate windows that have a non-blank title and describe each
     * one (title, app name, position, size, minimized/focused state) as text.
     */
    listWindowsTool = {
      name: "list_windows",
      description: "List all visible windows with their titles, positions, and sizes.",
      parameters: z6.object({}),
      /**
       * @returns {Promise<{ type: "text", content: string }>} One line per window;
       *   the focused window is marked with `[*]`.
       */
      async execute() {
        const { Window } = await getNodeScreenshots();
        const list = Window.all().filter((w) => w.title().trim().length > 0).map((w) => ({
          id: w.id(),
          title: w.title(),
          appName: w.appName(),
          x: w.x(),
          y: w.y(),
          width: w.width(),
          height: w.height(),
          isMinimized: w.isMinimized(),
          isFocused: w.isFocused()
        }));
        const formatted = list.map(
          (w) => `[${w.isFocused ? "*" : " "}] "${w.title}" (${w.appName}) \u2014 pos:(${w.x},${w.y}) size:${w.width}x${w.height}${w.isMinimized ? " [minimized]" : ""}`
        ).join("\n");
        return {
          type: "text",
          content: `Found ${list.length} windows:\n${formatted}`
        };
      }
    };
    /**
     * Tool: activate a window by title via nut-js. An exact case-insensitive
     * match is preferred; otherwise the first window whose title contains the
     * query is used.
     */
    focusWindowTool = {
      name: "focus_window",
      description: "Focus (activate) a window by its title. Uses partial, case-insensitive matching.",
      parameters: z6.object({
        title: z6.string().describe("Window title to search for (partial match)")
      }),
      /**
       * @param {{ title: string }} args - Validated tool arguments.
       * @returns {Promise<{ type: "text", content: string }>} Confirmation, or an
       *   `Error: ...` message when no window matches.
       */
      async execute(args) {
        const lower = args.title.toLowerCase();
        // An empty query would match every title via includes("") and focus an
        // arbitrary window, so reject it up front.
        if (lower.trim().length === 0) {
          return {
            type: "text",
            content: "Error: Window title must not be empty"
          };
        }
        const nut = await getNutJs4();
        const windows = await nut.getWindows();
        let target = null;
        for (const w of windows) {
          // The title is awaited here because nut-js exposes it asynchronously.
          const t = (await w.title).toLowerCase();
          if (t === lower) {
            // Exact match always wins, even over an earlier partial match.
            target = w;
            break;
          }
          if (!target && t.includes(lower)) {
            target = w;
          }
        }
        if (!target) {
          return {
            type: "text",
            content: `Error: No window found matching "${args.title}"`
          };
        }
        const title = await target.title;
        await target.focus();
        return { type: "text", content: `Focused window: "${title}"` };
      }
    };
    /**
     * Tool: capture a single window, scale it by the monitor's scale factor,
     * store the clean image, and return a grid-annotated copy whose labels are
     * shifted by the window position.
     */
    windowScreenshotTool = {
      name: "window_screenshot",
      description: "Capture a screenshot of a specific window by its title. The coordinate grid shows screen-absolute coordinates (matching mouse_click). Returns a screenshot ID.",
      parameters: z6.object({
        title: z6.string().describe("Window title to search for (partial match)")
      }),
      /**
       * @param {{ title: string }} args - Validated tool arguments.
       * @param {object} ctx - Tool context; `ctx.screenshots.save(...)` is used to store the clean image.
       * @returns {Promise<object>} An image result (`base64`, `mimeType`,
       *   `screenshotId`, `content` = grid reference text), or a text result
       *   starting with `Error:` when the window cannot be found or captured.
       */
      async execute(args, ctx) {
        // Same reasoning as focus_window: an empty query must not silently
        // pick whichever window happens to be listed first.
        if (args.title.trim().length === 0) {
          return {
            type: "text",
            content: "Error: Window title must not be empty"
          };
        }
        const { Window } = await getNodeScreenshots();
        const windows = Window.all().filter(
          (w) => w.title().trim().length > 0
        );
        const target = findWindowByTitle(windows, args.title);
        if (!target) {
          return {
            type: "text",
            content: `Error: No window found matching "${args.title}"`
          };
        }
        const winTitle = target.title();
        const winX = target.x();
        const winY = target.y();
        const image = target.captureImageSync();
        const physW = image.width;
        const physH = image.height;
        const monitor = target.currentMonitor();
        const scaleFactor = monitor ? monitor.scaleFactor() : 1;
        const logicalW = Math.round(physW / scaleFactor);
        const logicalH = Math.round(physH / scaleFactor);
        // sharp's resize rejects non-positive dimensions with an opaque error;
        // report an unusable capture explicitly instead.
        if (logicalW < 1 || logicalH < 1) {
          return {
            type: "text",
            content: `Error: Window "${winTitle}" has no capturable area (${physW}x${physH})`
          };
        }
        const raw = image.toRawSync();
        const resized = await sharp3(raw, {
          raw: { width: physW, height: physH, channels: 4 }
        }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
        // The un-annotated image is what gets stored under the screenshot ID.
        const cleanBase64 = resized.toString("base64");
        const id = ctx.screenshots.save(
          cleanBase64,
          "image/jpeg",
          `window: ${winTitle}`
        );
        // NOTE(review): the image is divided by scaleFactor but winX/winY are
        // passed through unscaled — confirm node-screenshots reports window
        // positions in the same (logical) units that mouse_click expects.
        const { image: gridImage, gridRef } = await addCoordinateGrid(resized, logicalW, logicalH, {
          offsetX: winX,
          offsetY: winY
        });
        const gridBase64 = gridImage.toString("base64");
        return {
          type: "image",
          base64: gridBase64,
          mimeType: "image/jpeg",
          screenshotId: id,
          content: gridRef
        };
      }
    };
  }
});
|
|
1194
|
+
|
|
1195
|
+
// src/tools/windows/command.ts
|
|
1196
|
+
import { z as z7 } from "zod";
|
|
981
1197
|
import { exec } from "child_process";
|
|
982
1198
|
var MAX_OUTPUT_LENGTH, runCommandTool;
|
|
983
1199
|
var init_command = __esm({
|
|
@@ -987,9 +1203,9 @@ var init_command = __esm({
|
|
|
987
1203
|
runCommandTool = {
|
|
988
1204
|
name: "run_command",
|
|
989
1205
|
description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
|
|
990
|
-
parameters:
|
|
991
|
-
command:
|
|
992
|
-
timeout:
|
|
1206
|
+
parameters: z7.object({
|
|
1207
|
+
command: z7.string().describe("The command to execute"),
|
|
1208
|
+
timeout: z7.number().positive().default(3e4).describe("Timeout in milliseconds")
|
|
993
1209
|
}),
|
|
994
1210
|
async execute(args) {
|
|
995
1211
|
return new Promise((resolve) => {
|
|
@@ -1022,7 +1238,7 @@ var init_command = __esm({
|
|
|
1022
1238
|
});
|
|
1023
1239
|
|
|
1024
1240
|
// src/tools/file/read.ts
|
|
1025
|
-
import { z as
|
|
1241
|
+
import { z as z8 } from "zod";
|
|
1026
1242
|
import { readFile } from "fs/promises";
|
|
1027
1243
|
var MAX_FILE_SIZE, fileReadTool;
|
|
1028
1244
|
var init_read = __esm({
|
|
@@ -1032,8 +1248,8 @@ var init_read = __esm({
|
|
|
1032
1248
|
fileReadTool = {
|
|
1033
1249
|
name: "file_read",
|
|
1034
1250
|
description: "Read the contents of a file at the given path.",
|
|
1035
|
-
parameters:
|
|
1036
|
-
path:
|
|
1251
|
+
parameters: z8.object({
|
|
1252
|
+
path: z8.string().describe("Absolute path to the file")
|
|
1037
1253
|
}),
|
|
1038
1254
|
async execute(args) {
|
|
1039
1255
|
try {
|
|
@@ -1055,7 +1271,7 @@ var init_read = __esm({
|
|
|
1055
1271
|
});
|
|
1056
1272
|
|
|
1057
1273
|
// src/tools/file/write.ts
|
|
1058
|
-
import { z as
|
|
1274
|
+
import { z as z9 } from "zod";
|
|
1059
1275
|
import { writeFile, mkdir } from "fs/promises";
|
|
1060
1276
|
import { dirname } from "path";
|
|
1061
1277
|
var fileWriteTool;
|
|
@@ -1065,9 +1281,9 @@ var init_write = __esm({
|
|
|
1065
1281
|
fileWriteTool = {
|
|
1066
1282
|
name: "file_write",
|
|
1067
1283
|
description: "Write content to a file at the given path. Creates parent directories if needed.",
|
|
1068
|
-
parameters:
|
|
1069
|
-
path:
|
|
1070
|
-
content:
|
|
1284
|
+
parameters: z9.object({
|
|
1285
|
+
path: z9.string().describe("Absolute path to the file"),
|
|
1286
|
+
content: z9.string().describe("Content to write")
|
|
1071
1287
|
}),
|
|
1072
1288
|
async execute(args) {
|
|
1073
1289
|
try {
|
|
@@ -1084,7 +1300,7 @@ var init_write = __esm({
|
|
|
1084
1300
|
});
|
|
1085
1301
|
|
|
1086
1302
|
// src/tools/file/image.ts
|
|
1087
|
-
import { z as
|
|
1303
|
+
import { z as z10 } from "zod";
|
|
1088
1304
|
import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
|
|
1089
1305
|
import { extname } from "path";
|
|
1090
1306
|
var IMAGE_EXTS, useLocalImageTool;
|
|
@@ -1095,9 +1311,9 @@ var init_image = __esm({
|
|
|
1095
1311
|
useLocalImageTool = {
|
|
1096
1312
|
name: "use_local_image",
|
|
1097
1313
|
description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
|
|
1098
|
-
parameters:
|
|
1099
|
-
path:
|
|
1100
|
-
label:
|
|
1314
|
+
parameters: z10.object({
|
|
1315
|
+
path: z10.string().describe("Absolute path to the image file"),
|
|
1316
|
+
label: z10.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
|
|
1101
1317
|
}),
|
|
1102
1318
|
async execute(args, ctx) {
|
|
1103
1319
|
if (!existsSync3(args.path)) {
|
|
@@ -1123,7 +1339,7 @@ var init_image = __esm({
|
|
|
1123
1339
|
});
|
|
1124
1340
|
|
|
1125
1341
|
// src/tools/browser/navigate.ts
|
|
1126
|
-
import { z as
|
|
1342
|
+
import { z as z11 } from "zod";
|
|
1127
1343
|
var browserNavigateTool;
|
|
1128
1344
|
var init_navigate = __esm({
|
|
1129
1345
|
"src/tools/browser/navigate.ts"() {
|
|
@@ -1131,8 +1347,8 @@ var init_navigate = __esm({
|
|
|
1131
1347
|
browserNavigateTool = {
|
|
1132
1348
|
name: "browser_navigate",
|
|
1133
1349
|
description: "Navigate the browser to a URL.",
|
|
1134
|
-
parameters:
|
|
1135
|
-
url:
|
|
1350
|
+
parameters: z11.object({
|
|
1351
|
+
url: z11.string().describe("The URL to navigate to")
|
|
1136
1352
|
}),
|
|
1137
1353
|
async execute(args, ctx) {
|
|
1138
1354
|
const browser = await ctx.getBrowser();
|
|
@@ -1147,7 +1363,7 @@ Page title: ${title}` };
|
|
|
1147
1363
|
});
|
|
1148
1364
|
|
|
1149
1365
|
// src/tools/browser/click.ts
|
|
1150
|
-
import { z as
|
|
1366
|
+
import { z as z12 } from "zod";
|
|
1151
1367
|
var browserClickTool;
|
|
1152
1368
|
var init_click = __esm({
|
|
1153
1369
|
"src/tools/browser/click.ts"() {
|
|
@@ -1155,8 +1371,8 @@ var init_click = __esm({
|
|
|
1155
1371
|
browserClickTool = {
|
|
1156
1372
|
name: "browser_click",
|
|
1157
1373
|
description: "Click an element on the web page using a CSS selector or text content.",
|
|
1158
|
-
parameters:
|
|
1159
|
-
selector:
|
|
1374
|
+
parameters: z12.object({
|
|
1375
|
+
selector: z12.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
|
|
1160
1376
|
}),
|
|
1161
1377
|
async execute(args, ctx) {
|
|
1162
1378
|
const browser = await ctx.getBrowser();
|
|
@@ -1169,7 +1385,7 @@ var init_click = __esm({
|
|
|
1169
1385
|
});
|
|
1170
1386
|
|
|
1171
1387
|
// src/tools/browser/type.ts
|
|
1172
|
-
import { z as
|
|
1388
|
+
import { z as z13 } from "zod";
|
|
1173
1389
|
var browserTypeTool;
|
|
1174
1390
|
var init_type = __esm({
|
|
1175
1391
|
"src/tools/browser/type.ts"() {
|
|
@@ -1177,10 +1393,10 @@ var init_type = __esm({
|
|
|
1177
1393
|
browserTypeTool = {
|
|
1178
1394
|
name: "browser_type",
|
|
1179
1395
|
description: "Type text into an input field on the web page.",
|
|
1180
|
-
parameters:
|
|
1181
|
-
selector:
|
|
1182
|
-
text:
|
|
1183
|
-
clear:
|
|
1396
|
+
parameters: z13.object({
|
|
1397
|
+
selector: z13.string().describe("CSS selector for the input element"),
|
|
1398
|
+
text: z13.string().describe("Text to type"),
|
|
1399
|
+
clear: z13.boolean().default(true).describe("Whether to clear the field before typing")
|
|
1184
1400
|
}),
|
|
1185
1401
|
async execute(args, ctx) {
|
|
1186
1402
|
const browser = await ctx.getBrowser();
|
|
@@ -1197,7 +1413,7 @@ var init_type = __esm({
|
|
|
1197
1413
|
});
|
|
1198
1414
|
|
|
1199
1415
|
// src/tools/browser/screenshot.ts
|
|
1200
|
-
import { z as
|
|
1416
|
+
import { z as z14 } from "zod";
|
|
1201
1417
|
var browserScreenshotTool;
|
|
1202
1418
|
var init_screenshot2 = __esm({
|
|
1203
1419
|
"src/tools/browser/screenshot.ts"() {
|
|
@@ -1205,8 +1421,8 @@ var init_screenshot2 = __esm({
|
|
|
1205
1421
|
browserScreenshotTool = {
|
|
1206
1422
|
name: "browser_screenshot",
|
|
1207
1423
|
description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
|
|
1208
|
-
parameters:
|
|
1209
|
-
fullPage:
|
|
1424
|
+
parameters: z14.object({
|
|
1425
|
+
fullPage: z14.boolean().default(false).describe("Whether to capture the full scrollable page")
|
|
1210
1426
|
}),
|
|
1211
1427
|
async execute(args, ctx) {
|
|
1212
1428
|
const browser = await ctx.getBrowser();
|
|
@@ -1231,7 +1447,7 @@ var init_screenshot2 = __esm({
|
|
|
1231
1447
|
});
|
|
1232
1448
|
|
|
1233
1449
|
// src/tools/browser/content.ts
|
|
1234
|
-
import { z as
|
|
1450
|
+
import { z as z15 } from "zod";
|
|
1235
1451
|
var MAX_CONTENT_LENGTH, browserContentTool;
|
|
1236
1452
|
var init_content = __esm({
|
|
1237
1453
|
"src/tools/browser/content.ts"() {
|
|
@@ -1240,7 +1456,7 @@ var init_content = __esm({
|
|
|
1240
1456
|
browserContentTool = {
|
|
1241
1457
|
name: "browser_content",
|
|
1242
1458
|
description: "Get the text content of the current web page. Returns visible text, not HTML.",
|
|
1243
|
-
parameters:
|
|
1459
|
+
parameters: z15.object({}),
|
|
1244
1460
|
async execute(_args, ctx) {
|
|
1245
1461
|
const browser = await ctx.getBrowser();
|
|
1246
1462
|
const page = await browser.getPage();
|
|
@@ -1263,7 +1479,7 @@ ${text}`
|
|
|
1263
1479
|
});
|
|
1264
1480
|
|
|
1265
1481
|
// src/tools/browser/scroll.ts
|
|
1266
|
-
import { z as
|
|
1482
|
+
import { z as z16 } from "zod";
|
|
1267
1483
|
var browserScrollTool;
|
|
1268
1484
|
var init_scroll = __esm({
|
|
1269
1485
|
"src/tools/browser/scroll.ts"() {
|
|
@@ -1271,9 +1487,9 @@ var init_scroll = __esm({
|
|
|
1271
1487
|
browserScrollTool = {
|
|
1272
1488
|
name: "browser_scroll",
|
|
1273
1489
|
description: "Scroll the current web page.",
|
|
1274
|
-
parameters:
|
|
1275
|
-
direction:
|
|
1276
|
-
amount:
|
|
1490
|
+
parameters: z16.object({
|
|
1491
|
+
direction: z16.enum(["up", "down"]).describe("Scroll direction"),
|
|
1492
|
+
amount: z16.number().positive().default(500).describe("Pixels to scroll")
|
|
1277
1493
|
}),
|
|
1278
1494
|
async execute(args, ctx) {
|
|
1279
1495
|
const browser = await ctx.getBrowser();
|
|
@@ -1287,7 +1503,7 @@ var init_scroll = __esm({
|
|
|
1287
1503
|
});
|
|
1288
1504
|
|
|
1289
1505
|
// src/tools/control/report.ts
|
|
1290
|
-
import { z as
|
|
1506
|
+
import { z as z17 } from "zod";
|
|
1291
1507
|
var reportTool;
|
|
1292
1508
|
var init_report = __esm({
|
|
1293
1509
|
"src/tools/control/report.ts"() {
|
|
@@ -1295,12 +1511,12 @@ var init_report = __esm({
|
|
|
1295
1511
|
reportTool = {
|
|
1296
1512
|
name: "report",
|
|
1297
1513
|
description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
|
|
1298
|
-
parameters:
|
|
1299
|
-
status:
|
|
1514
|
+
parameters: z17.object({
|
|
1515
|
+
status: z17.enum(["completed", "blocked", "need_guidance"]).describe(
|
|
1300
1516
|
'"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
|
|
1301
1517
|
),
|
|
1302
|
-
content:
|
|
1303
|
-
data:
|
|
1518
|
+
content: z17.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
|
|
1519
|
+
data: z17.unknown().optional().describe("Optional structured data to return")
|
|
1304
1520
|
}),
|
|
1305
1521
|
async execute(args) {
|
|
1306
1522
|
return {
|
|
@@ -1323,6 +1539,11 @@ function createToolRegistry() {
|
|
|
1323
1539
|
registry2.register(mouseScrollTool);
|
|
1324
1540
|
registry2.register(keyboardTypeTool);
|
|
1325
1541
|
registry2.register(keyboardPressTool);
|
|
1542
|
+
registry2.register(clipboardTypeTool);
|
|
1543
|
+
registry2.register(switchInputMethodTool);
|
|
1544
|
+
registry2.register(listWindowsTool);
|
|
1545
|
+
registry2.register(focusWindowTool);
|
|
1546
|
+
registry2.register(windowScreenshotTool);
|
|
1326
1547
|
registry2.register(runCommandTool);
|
|
1327
1548
|
registry2.register(fileReadTool);
|
|
1328
1549
|
registry2.register(fileWriteTool);
|
|
@@ -1343,6 +1564,8 @@ var init_tools = __esm({
|
|
|
1343
1564
|
init_screenshot();
|
|
1344
1565
|
init_mouse();
|
|
1345
1566
|
init_keyboard();
|
|
1567
|
+
init_clipboard();
|
|
1568
|
+
init_window();
|
|
1346
1569
|
init_command();
|
|
1347
1570
|
init_read();
|
|
1348
1571
|
init_write();
|
|
@@ -1494,19 +1717,19 @@ var init_session_registry = __esm({
|
|
|
1494
1717
|
});
|
|
1495
1718
|
|
|
1496
1719
|
// src/mcp/tools.ts
|
|
1497
|
-
import { z as
|
|
1720
|
+
import { z as z18 } from "zod";
|
|
1498
1721
|
function registerMcpTools(server2, registry2) {
|
|
1499
1722
|
server2.tool(
|
|
1500
1723
|
"create_session",
|
|
1501
1724
|
"Create a new automation session with a small LLM agent. Returns a session_id.",
|
|
1502
1725
|
{
|
|
1503
|
-
api_key:
|
|
1504
|
-
base_url:
|
|
1505
|
-
model:
|
|
1506
|
-
cdp_url:
|
|
1507
|
-
timeout_ms:
|
|
1508
|
-
max_steps:
|
|
1509
|
-
max_rounds:
|
|
1726
|
+
api_key: z18.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
|
|
1727
|
+
base_url: z18.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
|
|
1728
|
+
model: z18.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
|
|
1729
|
+
cdp_url: z18.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
|
|
1730
|
+
timeout_ms: z18.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
|
|
1731
|
+
max_steps: z18.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
|
|
1732
|
+
max_rounds: z18.number().optional().describe("Max instruction rounds per session (default: 20)")
|
|
1510
1733
|
},
|
|
1511
1734
|
async (args) => {
|
|
1512
1735
|
const config = loadConfig({
|
|
@@ -1533,8 +1756,8 @@ function registerMcpTools(server2, registry2) {
|
|
|
1533
1756
|
"send_instruction",
|
|
1534
1757
|
"Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
|
|
1535
1758
|
{
|
|
1536
|
-
session_id:
|
|
1537
|
-
instruction:
|
|
1759
|
+
session_id: z18.string().describe("Session ID from create_session"),
|
|
1760
|
+
instruction: z18.string().describe("What you want the agent to do, in natural language")
|
|
1538
1761
|
},
|
|
1539
1762
|
async (args) => {
|
|
1540
1763
|
const session = registry2.get(args.session_id);
|
|
@@ -1584,7 +1807,7 @@ function registerMcpTools(server2, registry2) {
|
|
|
1584
1807
|
"done_session",
|
|
1585
1808
|
"Terminate a session and free all resources.",
|
|
1586
1809
|
{
|
|
1587
|
-
session_id:
|
|
1810
|
+
session_id: z18.string().describe("Session ID to terminate")
|
|
1588
1811
|
},
|
|
1589
1812
|
async (args) => {
|
|
1590
1813
|
await registry2.destroy(args.session_id);
|