windows-use 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -70
- package/dist/cli.js +689 -129
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +88 -16
- package/dist/index.js +467 -88
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +504 -124
- package/dist/mcp/server.js.map +1 -1
- package/package.json +10 -1
package/dist/mcp/server.js
CHANGED
|
@@ -8,23 +8,14 @@ import crypto from "crypto";
|
|
|
8
8
|
// src/agent/context-manager.ts
|
|
9
9
|
var ContextManager = class {
|
|
10
10
|
messages = [];
|
|
11
|
-
maxMessages;
|
|
12
|
-
constructor(maxMessages) {
|
|
13
|
-
this.maxMessages = maxMessages;
|
|
14
|
-
}
|
|
15
11
|
append(message) {
|
|
16
12
|
this.messages.push(message);
|
|
17
13
|
}
|
|
18
|
-
/** Returns
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
const systemPrompt = this.messages[0]?.role === "system" ? this.messages[0] : null;
|
|
22
|
-
const nonSystem = systemPrompt ? this.messages.slice(1) : this.messages;
|
|
23
|
-
const windowSize = this.maxMessages - (systemPrompt ? 1 : 0);
|
|
24
|
-
const windowed = nonSystem.length > windowSize ? nonSystem.slice(-windowSize) : nonSystem;
|
|
25
|
-
return systemPrompt ? [systemPrompt, ...windowed] : windowed;
|
|
14
|
+
/** Returns all messages. */
|
|
15
|
+
getMessages() {
|
|
16
|
+
return [...this.messages];
|
|
26
17
|
}
|
|
27
|
-
/** Total messages stored
|
|
18
|
+
/** Total messages stored. */
|
|
28
19
|
get length() {
|
|
29
20
|
return this.messages.length;
|
|
30
21
|
}
|
|
@@ -78,7 +69,16 @@ Call \`report\` when:
|
|
|
78
69
|
- **"blocked"**: You cannot proceed (CAPTCHA, login wall, unexpected error). Explain what's blocking you.
|
|
79
70
|
- **"need_guidance"**: You need a decision or clarification. Describe what you need.
|
|
80
71
|
|
|
81
|
-
Calling \`report\` stops your execution.
|
|
72
|
+
Calling \`report\` stops your execution. The \`content\` field supports a rich document format \u2014 mix text with screenshots using \`[Image:img_X]\` markers:
|
|
73
|
+
|
|
74
|
+
\`\`\`
|
|
75
|
+
report({
|
|
76
|
+
status: "completed",
|
|
77
|
+
content: "Here is what I found:\\n[Image:img_2]\\nThe page shows the search results.\\n[Image:img_3]\\nI also checked the sidebar."
|
|
78
|
+
})
|
|
79
|
+
\`\`\`
|
|
80
|
+
|
|
81
|
+
Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
|
|
82
82
|
|
|
83
83
|
## Important
|
|
84
84
|
- Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
|
|
@@ -94,6 +94,8 @@ var AgentRunner = class {
|
|
|
94
94
|
config;
|
|
95
95
|
toolContext;
|
|
96
96
|
initialized = false;
|
|
97
|
+
onStep = null;
|
|
98
|
+
roundsUsed = 0;
|
|
97
99
|
constructor(llmClient, contextManager, toolRegistry, config, toolContext) {
|
|
98
100
|
this.llmClient = llmClient;
|
|
99
101
|
this.contextManager = contextManager;
|
|
@@ -101,7 +103,30 @@ var AgentRunner = class {
|
|
|
101
103
|
this.config = config;
|
|
102
104
|
this.toolContext = toolContext;
|
|
103
105
|
}
|
|
106
|
+
/** Register a callback to receive step-by-step progress events */
|
|
107
|
+
setOnStep(cb) {
|
|
108
|
+
this.onStep = cb;
|
|
109
|
+
}
|
|
110
|
+
emit(event) {
|
|
111
|
+
this.onStep?.(event);
|
|
112
|
+
}
|
|
113
|
+
/** How many instruction rounds have been used in this session */
|
|
114
|
+
get currentRound() {
|
|
115
|
+
return this.roundsUsed;
|
|
116
|
+
}
|
|
117
|
+
/** Whether this session has exhausted its max rounds */
|
|
118
|
+
get roundsExhausted() {
|
|
119
|
+
return this.roundsUsed >= this.config.maxRounds;
|
|
120
|
+
}
|
|
104
121
|
async run(instruction) {
|
|
122
|
+
if (this.roundsExhausted) {
|
|
123
|
+
return {
|
|
124
|
+
status: "blocked",
|
|
125
|
+
content: `Session has reached the maximum number of instruction rounds (${this.config.maxRounds}). Create a new session to continue.`,
|
|
126
|
+
stepsUsed: 0
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
this.roundsUsed++;
|
|
105
130
|
if (!this.initialized) {
|
|
106
131
|
this.contextManager.append({
|
|
107
132
|
role: "system",
|
|
@@ -117,7 +142,7 @@ var AgentRunner = class {
|
|
|
117
142
|
while (stepsUsed < this.config.maxSteps) {
|
|
118
143
|
stepsUsed++;
|
|
119
144
|
const remaining = this.config.maxSteps - stepsUsed;
|
|
120
|
-
const messages = this.contextManager.
|
|
145
|
+
const messages = this.contextManager.getMessages();
|
|
121
146
|
if (remaining <= 3 && remaining >= 0) {
|
|
122
147
|
messages.push({
|
|
123
148
|
role: "system",
|
|
@@ -130,9 +155,10 @@ var AgentRunner = class {
|
|
|
130
155
|
response = await this.llmClient.chat(messages, tools);
|
|
131
156
|
} catch (err) {
|
|
132
157
|
const msg = err instanceof Error ? err.message : String(err);
|
|
158
|
+
this.emit({ type: "error", step: stepsUsed, message: `LLM API error: ${msg}` });
|
|
133
159
|
return {
|
|
134
160
|
status: "blocked",
|
|
135
|
-
|
|
161
|
+
content: `LLM API error: ${msg}`,
|
|
136
162
|
stepsUsed
|
|
137
163
|
};
|
|
138
164
|
}
|
|
@@ -140,26 +166,31 @@ var AgentRunner = class {
|
|
|
140
166
|
if (!choice) {
|
|
141
167
|
return {
|
|
142
168
|
status: "blocked",
|
|
143
|
-
|
|
169
|
+
content: "LLM returned empty response",
|
|
144
170
|
stepsUsed
|
|
145
171
|
};
|
|
146
172
|
}
|
|
147
173
|
const message = choice.message;
|
|
174
|
+
if (message.content) {
|
|
175
|
+
this.emit({ type: "thinking", step: stepsUsed, content: message.content });
|
|
176
|
+
}
|
|
148
177
|
if (choice.finish_reason === "stop" || !message.tool_calls?.length) {
|
|
149
178
|
const text = message.content ?? "";
|
|
150
179
|
this.contextManager.append({ role: "assistant", content: text });
|
|
151
180
|
return {
|
|
152
181
|
status: "need_guidance",
|
|
153
|
-
|
|
182
|
+
content: text || "Agent stopped without calling report.",
|
|
154
183
|
stepsUsed
|
|
155
184
|
};
|
|
156
185
|
}
|
|
157
186
|
this.contextManager.append(message);
|
|
158
187
|
for (const toolCall of message.tool_calls) {
|
|
188
|
+
const toolName = toolCall.function.name;
|
|
159
189
|
let args;
|
|
160
190
|
try {
|
|
161
191
|
args = JSON.parse(toolCall.function.arguments);
|
|
162
192
|
} catch {
|
|
193
|
+
this.emit({ type: "error", step: stepsUsed, message: `Failed to parse args for ${toolName}` });
|
|
163
194
|
this.contextManager.append({
|
|
164
195
|
role: "tool",
|
|
165
196
|
tool_call_id: toolCall.id,
|
|
@@ -167,15 +198,17 @@ var AgentRunner = class {
|
|
|
167
198
|
});
|
|
168
199
|
continue;
|
|
169
200
|
}
|
|
201
|
+
this.emit({ type: "tool_call", step: stepsUsed, name: toolName, args });
|
|
170
202
|
let result;
|
|
171
203
|
try {
|
|
172
204
|
result = await this.toolRegistry.execute(
|
|
173
|
-
|
|
205
|
+
toolName,
|
|
174
206
|
args,
|
|
175
207
|
this.toolContext
|
|
176
208
|
);
|
|
177
209
|
} catch (err) {
|
|
178
210
|
const msg = err instanceof Error ? err.message : String(err);
|
|
211
|
+
this.emit({ type: "error", step: stepsUsed, message: `${toolName} failed: ${msg}` });
|
|
179
212
|
this.contextManager.append({
|
|
180
213
|
role: "tool",
|
|
181
214
|
tool_call_id: toolCall.id,
|
|
@@ -184,6 +217,7 @@ var AgentRunner = class {
|
|
|
184
217
|
continue;
|
|
185
218
|
}
|
|
186
219
|
if (result.type === "report") {
|
|
220
|
+
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `[${result.status}] report submitted` });
|
|
187
221
|
this.contextManager.append({
|
|
188
222
|
role: "tool",
|
|
189
223
|
tool_call_id: toolCall.id,
|
|
@@ -191,18 +225,18 @@ var AgentRunner = class {
|
|
|
191
225
|
});
|
|
192
226
|
return {
|
|
193
227
|
status: result.status,
|
|
194
|
-
|
|
195
|
-
screenshot: result.screenshot,
|
|
228
|
+
content: result.content,
|
|
196
229
|
data: result.data,
|
|
197
230
|
stepsUsed
|
|
198
231
|
};
|
|
199
232
|
}
|
|
200
233
|
if (result.type === "image") {
|
|
234
|
+
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
|
|
201
235
|
this.contextManager.append({
|
|
202
236
|
role: "tool",
|
|
203
237
|
tool_call_id: toolCall.id,
|
|
204
238
|
content: [
|
|
205
|
-
{ type: "text", text:
|
|
239
|
+
{ type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
|
|
206
240
|
{
|
|
207
241
|
type: "image_url",
|
|
208
242
|
image_url: {
|
|
@@ -212,6 +246,8 @@ var AgentRunner = class {
|
|
|
212
246
|
]
|
|
213
247
|
});
|
|
214
248
|
} else {
|
|
249
|
+
const preview = result.content.length > 200 ? result.content.slice(0, 200) + "..." : result.content;
|
|
250
|
+
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: preview });
|
|
215
251
|
this.contextManager.append({
|
|
216
252
|
role: "tool",
|
|
217
253
|
tool_call_id: toolCall.id,
|
|
@@ -222,30 +258,208 @@ var AgentRunner = class {
|
|
|
222
258
|
}
|
|
223
259
|
return {
|
|
224
260
|
status: "blocked",
|
|
225
|
-
|
|
261
|
+
content: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
|
|
226
262
|
stepsUsed
|
|
227
263
|
};
|
|
228
264
|
}
|
|
229
265
|
};
|
|
230
266
|
|
|
231
267
|
// src/tools/browser/client.ts
|
|
268
|
+
import { existsSync, mkdirSync, cpSync, readdirSync } from "fs";
|
|
269
|
+
import { spawn, execSync } from "child_process";
|
|
270
|
+
import { join } from "path";
|
|
271
|
+
import { homedir } from "os";
|
|
272
|
+
var CHROME_PATHS = [
|
|
273
|
+
// Windows
|
|
274
|
+
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
|
|
275
|
+
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
|
|
276
|
+
`${process.env.LOCALAPPDATA ?? ""}\\Google\\Chrome\\Application\\chrome.exe`,
|
|
277
|
+
// macOS
|
|
278
|
+
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
|
279
|
+
// Linux
|
|
280
|
+
"/usr/bin/google-chrome",
|
|
281
|
+
"/usr/bin/google-chrome-stable",
|
|
282
|
+
"/usr/bin/chromium-browser",
|
|
283
|
+
"/usr/bin/chromium"
|
|
284
|
+
];
|
|
285
|
+
var SKIP_DIRS = /* @__PURE__ */ new Set([
|
|
286
|
+
"Cache",
|
|
287
|
+
"Code Cache",
|
|
288
|
+
"GPUCache",
|
|
289
|
+
"Service Worker",
|
|
290
|
+
"CacheStorage",
|
|
291
|
+
"File System",
|
|
292
|
+
"blob_storage",
|
|
293
|
+
"IndexedDB",
|
|
294
|
+
"DawnCache",
|
|
295
|
+
"GrShaderCache",
|
|
296
|
+
"ShaderCache",
|
|
297
|
+
"optimization_guide_model_store",
|
|
298
|
+
"BrowserMetrics",
|
|
299
|
+
"Crashpad",
|
|
300
|
+
"component_crx_cache"
|
|
301
|
+
]);
|
|
302
|
+
function findChrome() {
|
|
303
|
+
for (const p of CHROME_PATHS) {
|
|
304
|
+
if (p && existsSync(p)) return p;
|
|
305
|
+
}
|
|
306
|
+
return null;
|
|
307
|
+
}
|
|
308
|
+
function findUserDataDir() {
|
|
309
|
+
const candidates = [
|
|
310
|
+
// Windows
|
|
311
|
+
join(process.env.LOCALAPPDATA ?? "", "Google", "Chrome", "User Data"),
|
|
312
|
+
// macOS
|
|
313
|
+
join(homedir(), "Library", "Application Support", "Google", "Chrome"),
|
|
314
|
+
// Linux
|
|
315
|
+
join(homedir(), ".config", "google-chrome"),
|
|
316
|
+
join(homedir(), ".config", "chromium")
|
|
317
|
+
];
|
|
318
|
+
for (const p of candidates) {
|
|
319
|
+
if (p && existsSync(p)) return p;
|
|
320
|
+
}
|
|
321
|
+
return null;
|
|
322
|
+
}
|
|
323
|
+
function getCdpPort(cdpUrl) {
|
|
324
|
+
try {
|
|
325
|
+
return parseInt(new URL(cdpUrl).port, 10) || 9222;
|
|
326
|
+
} catch {
|
|
327
|
+
return 9222;
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
function isChromeRunning() {
|
|
331
|
+
try {
|
|
332
|
+
if (process.platform === "win32") {
|
|
333
|
+
const out = execSync('tasklist /FI "IMAGENAME eq chrome.exe" /NH', {
|
|
334
|
+
encoding: "utf-8",
|
|
335
|
+
windowsHide: true
|
|
336
|
+
});
|
|
337
|
+
return out.includes("chrome.exe");
|
|
338
|
+
} else {
|
|
339
|
+
execSync('pgrep -x "chrome|chromium|google-chrome"', { encoding: "utf-8" });
|
|
340
|
+
return true;
|
|
341
|
+
}
|
|
342
|
+
} catch {
|
|
343
|
+
return false;
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
function syncProfile(sourceDir, targetDir) {
|
|
347
|
+
mkdirSync(targetDir, { recursive: true });
|
|
348
|
+
const entries = readdirSync(sourceDir, { withFileTypes: true });
|
|
349
|
+
for (const entry of entries) {
|
|
350
|
+
const src = join(sourceDir, entry.name);
|
|
351
|
+
const dst = join(targetDir, entry.name);
|
|
352
|
+
if (entry.isFile()) {
|
|
353
|
+
try {
|
|
354
|
+
cpSync(src, dst, { force: true });
|
|
355
|
+
} catch {
|
|
356
|
+
}
|
|
357
|
+
} else if (entry.isDirectory()) {
|
|
358
|
+
if (entry.name === "Default" || entry.name.startsWith("Profile ")) {
|
|
359
|
+
syncProfileDir(src, dst);
|
|
360
|
+
} else if (!SKIP_DIRS.has(entry.name)) {
|
|
361
|
+
try {
|
|
362
|
+
cpSync(src, dst, { recursive: true, force: true });
|
|
363
|
+
} catch {
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
function syncProfileDir(sourceDir, targetDir) {
|
|
370
|
+
mkdirSync(targetDir, { recursive: true });
|
|
371
|
+
let entries;
|
|
372
|
+
try {
|
|
373
|
+
entries = readdirSync(sourceDir, { withFileTypes: true });
|
|
374
|
+
} catch {
|
|
375
|
+
return;
|
|
376
|
+
}
|
|
377
|
+
for (const entry of entries) {
|
|
378
|
+
if (SKIP_DIRS.has(entry.name)) continue;
|
|
379
|
+
const src = join(sourceDir, entry.name);
|
|
380
|
+
const dst = join(targetDir, entry.name);
|
|
381
|
+
try {
|
|
382
|
+
if (entry.isFile()) {
|
|
383
|
+
cpSync(src, dst, { force: true });
|
|
384
|
+
} else if (entry.isDirectory()) {
|
|
385
|
+
cpSync(src, dst, { recursive: true, force: true });
|
|
386
|
+
}
|
|
387
|
+
} catch {
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
}
|
|
232
391
|
var BrowserClient = class {
|
|
233
392
|
browser = null;
|
|
234
393
|
context = null;
|
|
235
394
|
_page = null;
|
|
236
395
|
cdpUrl;
|
|
396
|
+
chromeProcess = null;
|
|
237
397
|
constructor(cdpUrl) {
|
|
238
398
|
this.cdpUrl = cdpUrl;
|
|
239
399
|
}
|
|
240
400
|
async connect() {
|
|
241
401
|
if (this.browser) return;
|
|
242
402
|
const { chromium } = await import("playwright");
|
|
243
|
-
|
|
403
|
+
try {
|
|
404
|
+
this.browser = await chromium.connectOverCDP(this.cdpUrl);
|
|
405
|
+
} catch {
|
|
406
|
+
await this.launchChrome();
|
|
407
|
+
this.browser = await chromium.connectOverCDP(this.cdpUrl);
|
|
408
|
+
}
|
|
244
409
|
const contexts = this.browser.contexts();
|
|
245
410
|
this.context = contexts[0] ?? await this.browser.newContext();
|
|
246
411
|
const pages = this.context.pages();
|
|
247
412
|
this._page = pages[0] ?? await this.context.newPage();
|
|
248
413
|
}
|
|
414
|
+
async launchChrome() {
|
|
415
|
+
const chromePath = findChrome();
|
|
416
|
+
if (!chromePath) {
|
|
417
|
+
throw new Error(
|
|
418
|
+
"Chrome not found. Please install Chrome or start it manually with: chrome --remote-debugging-port=9222"
|
|
419
|
+
);
|
|
420
|
+
}
|
|
421
|
+
const port = getCdpPort(this.cdpUrl);
|
|
422
|
+
if (isChromeRunning()) {
|
|
423
|
+
console.error("[windows-use] Chrome is running without CDP. Restarting with --remote-debugging-port...");
|
|
424
|
+
try {
|
|
425
|
+
if (process.platform === "win32") {
|
|
426
|
+
execSync("taskkill /F /IM chrome.exe /T", { windowsHide: true, stdio: "ignore" });
|
|
427
|
+
} else {
|
|
428
|
+
execSync("pkill -f chrome", { stdio: "ignore" });
|
|
429
|
+
}
|
|
430
|
+
} catch {
|
|
431
|
+
}
|
|
432
|
+
await new Promise((r) => setTimeout(r, 1500));
|
|
433
|
+
}
|
|
434
|
+
const targetDir = join(homedir(), ".windows-use", "chrome-profile");
|
|
435
|
+
const userDir = findUserDataDir();
|
|
436
|
+
if (userDir) {
|
|
437
|
+
console.error("[windows-use] Syncing Chrome profile (cookies, login state)...");
|
|
438
|
+
syncProfile(userDir, targetDir);
|
|
439
|
+
console.error("[windows-use] Profile synced.");
|
|
440
|
+
} else {
|
|
441
|
+
mkdirSync(targetDir, { recursive: true });
|
|
442
|
+
}
|
|
443
|
+
console.error(`[windows-use] Launching Chrome with --remote-debugging-port=${port}`);
|
|
444
|
+
this.chromeProcess = spawn(
|
|
445
|
+
chromePath,
|
|
446
|
+
[
|
|
447
|
+
`--remote-debugging-port=${port}`,
|
|
448
|
+
`--user-data-dir=${targetDir}`
|
|
449
|
+
],
|
|
450
|
+
{ detached: true, stdio: "ignore" }
|
|
451
|
+
);
|
|
452
|
+
this.chromeProcess.unref();
|
|
453
|
+
for (let i = 0; i < 30; i++) {
|
|
454
|
+
try {
|
|
455
|
+
const res = await fetch(`http://localhost:${port}/json/version`);
|
|
456
|
+
if (res.ok) return;
|
|
457
|
+
} catch {
|
|
458
|
+
}
|
|
459
|
+
await new Promise((r) => setTimeout(r, 500));
|
|
460
|
+
}
|
|
461
|
+
throw new Error("Chrome launched but CDP endpoint did not become available within 15s");
|
|
462
|
+
}
|
|
249
463
|
async getPage() {
|
|
250
464
|
await this.connect();
|
|
251
465
|
return this._page;
|
|
@@ -380,11 +594,69 @@ var ToolRegistry = class {
|
|
|
380
594
|
|
|
381
595
|
// src/tools/windows/screenshot.ts
|
|
382
596
|
import { z } from "zod";
|
|
597
|
+
import sharp2 from "sharp";
|
|
598
|
+
|
|
599
|
+
// src/tools/windows/grid-overlay.ts
|
|
600
|
+
import sharp from "sharp";
|
|
601
|
+
async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
602
|
+
const gridSpacing = options.gridSpacing ?? 100;
|
|
603
|
+
const labelSpacing = options.labelSpacing ?? 200;
|
|
604
|
+
const majorSpacing = gridSpacing * 5;
|
|
605
|
+
const svgParts = [];
|
|
606
|
+
for (let x = gridSpacing; x < width; x += gridSpacing) {
|
|
607
|
+
const isMajor = x % majorSpacing === 0;
|
|
608
|
+
const opacity = isMajor ? 0.35 : 0.15;
|
|
609
|
+
const sw = isMajor ? 1.5 : 0.5;
|
|
610
|
+
svgParts.push(
|
|
611
|
+
`<line x1="${x}" y1="0" x2="${x}" y2="${height}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
|
|
612
|
+
);
|
|
613
|
+
}
|
|
614
|
+
for (let y = gridSpacing; y < height; y += gridSpacing) {
|
|
615
|
+
const isMajor = y % majorSpacing === 0;
|
|
616
|
+
const opacity = isMajor ? 0.35 : 0.15;
|
|
617
|
+
const sw = isMajor ? 1.5 : 0.5;
|
|
618
|
+
svgParts.push(
|
|
619
|
+
`<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
|
|
620
|
+
);
|
|
621
|
+
}
|
|
622
|
+
for (let x = labelSpacing; x < width; x += labelSpacing) {
|
|
623
|
+
const text = String(x);
|
|
624
|
+
const tw = text.length * 7.5 + 6;
|
|
625
|
+
svgParts.push(
|
|
626
|
+
`<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
627
|
+
`<text x="${x}" y="14" text-anchor="middle" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
|
|
628
|
+
);
|
|
629
|
+
}
|
|
630
|
+
for (let y = labelSpacing; y < height; y += labelSpacing) {
|
|
631
|
+
const text = String(y);
|
|
632
|
+
const tw = text.length * 7.5 + 6;
|
|
633
|
+
svgParts.push(
|
|
634
|
+
`<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
635
|
+
`<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
|
|
636
|
+
);
|
|
637
|
+
}
|
|
638
|
+
svgParts.push(
|
|
639
|
+
`<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
640
|
+
`<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
|
|
641
|
+
);
|
|
642
|
+
const dimText = `${width}x${height}`;
|
|
643
|
+
const dimTw = dimText.length * 7.5 + 6;
|
|
644
|
+
svgParts.push(
|
|
645
|
+
`<rect x="${width - dimTw - 2}" y="${height - 18}" width="${dimTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
646
|
+
`<text x="${width - dimTw / 2 - 2}" y="${height - 6}" text-anchor="middle" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${dimText}</text>`
|
|
647
|
+
);
|
|
648
|
+
const svg = Buffer.from(
|
|
649
|
+
`<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
|
|
650
|
+
);
|
|
651
|
+
return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
// src/tools/windows/screenshot.ts
|
|
383
655
|
var screenshotTool = {
|
|
384
656
|
name: "screenshot",
|
|
385
|
-
description: "Capture the full screen
|
|
657
|
+
description: "Capture the full screen with a coordinate grid overlay. The grid shows pixel coordinates that match mouse_click/mouse_move coordinates. Returns a screenshot ID.",
|
|
386
658
|
parameters: z.object({}),
|
|
387
|
-
async execute() {
|
|
659
|
+
async execute(_args, ctx) {
|
|
388
660
|
const { Monitor } = await import("node-screenshots");
|
|
389
661
|
const monitors = Monitor.all();
|
|
390
662
|
const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
|
|
@@ -392,11 +664,24 @@ var screenshotTool = {
|
|
|
392
664
|
return { type: "text", content: "Error: No monitor found" };
|
|
393
665
|
}
|
|
394
666
|
const image = primary.captureImageSync();
|
|
395
|
-
const
|
|
667
|
+
const physW = image.width;
|
|
668
|
+
const physH = image.height;
|
|
669
|
+
const scaleFactor = primary.scaleFactor ?? 1;
|
|
670
|
+
const logicalW = Math.round(physW / scaleFactor);
|
|
671
|
+
const logicalH = Math.round(physH / scaleFactor);
|
|
672
|
+
const raw = image.toRawSync();
|
|
673
|
+
const resized = await sharp2(raw, {
|
|
674
|
+
raw: { width: physW, height: physH, channels: 4 }
|
|
675
|
+
}).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
|
|
676
|
+
const cleanBase64 = resized.toString("base64");
|
|
677
|
+
const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
|
|
678
|
+
const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
|
|
679
|
+
const gridBase64 = gridImage.toString("base64");
|
|
396
680
|
return {
|
|
397
681
|
type: "image",
|
|
398
|
-
base64:
|
|
399
|
-
mimeType: "image/
|
|
682
|
+
base64: gridBase64,
|
|
683
|
+
mimeType: "image/jpeg",
|
|
684
|
+
screenshotId: id
|
|
400
685
|
};
|
|
401
686
|
}
|
|
402
687
|
};
|
|
@@ -631,13 +916,46 @@ var fileWriteTool = {
|
|
|
631
916
|
}
|
|
632
917
|
};
|
|
633
918
|
|
|
634
|
-
// src/tools/
|
|
919
|
+
// src/tools/file/image.ts
|
|
635
920
|
import { z as z7 } from "zod";
|
|
921
|
+
import { readFileSync, existsSync as existsSync2 } from "fs";
|
|
922
|
+
import { extname } from "path";
|
|
923
|
+
var IMAGE_EXTS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".bmp", ".webp"]);
|
|
924
|
+
var useLocalImageTool = {
|
|
925
|
+
name: "use_local_image",
|
|
926
|
+
description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
|
|
927
|
+
parameters: z7.object({
|
|
928
|
+
path: z7.string().describe("Absolute path to the image file"),
|
|
929
|
+
label: z7.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
|
|
930
|
+
}),
|
|
931
|
+
async execute(args, ctx) {
|
|
932
|
+
if (!existsSync2(args.path)) {
|
|
933
|
+
return { type: "text", content: `Error: File not found: ${args.path}` };
|
|
934
|
+
}
|
|
935
|
+
const ext = extname(args.path).toLowerCase();
|
|
936
|
+
if (!IMAGE_EXTS.has(ext)) {
|
|
937
|
+
return { type: "text", content: `Error: Not a supported image format (${ext}). Supported: ${[...IMAGE_EXTS].join(", ")}` };
|
|
938
|
+
}
|
|
939
|
+
const buf = readFileSync(args.path);
|
|
940
|
+
const mimeType = ext === ".png" ? "image/png" : "image/jpeg";
|
|
941
|
+
const base64 = buf.toString("base64");
|
|
942
|
+
const id = ctx.screenshots.save(base64, mimeType, args.label);
|
|
943
|
+
return {
|
|
944
|
+
type: "image",
|
|
945
|
+
base64,
|
|
946
|
+
mimeType,
|
|
947
|
+
screenshotId: id
|
|
948
|
+
};
|
|
949
|
+
}
|
|
950
|
+
};
|
|
951
|
+
|
|
952
|
+
// src/tools/browser/navigate.ts
|
|
953
|
+
import { z as z8 } from "zod";
|
|
636
954
|
var browserNavigateTool = {
|
|
637
955
|
name: "browser_navigate",
|
|
638
956
|
description: "Navigate the browser to a URL.",
|
|
639
|
-
parameters:
|
|
640
|
-
url:
|
|
957
|
+
parameters: z8.object({
|
|
958
|
+
url: z8.string().describe("The URL to navigate to")
|
|
641
959
|
}),
|
|
642
960
|
async execute(args, ctx) {
|
|
643
961
|
const browser = await ctx.getBrowser();
|
|
@@ -650,12 +968,12 @@ Page title: ${title}` };
|
|
|
650
968
|
};
|
|
651
969
|
|
|
652
970
|
// src/tools/browser/click.ts
|
|
653
|
-
import { z as
|
|
971
|
+
import { z as z9 } from "zod";
|
|
654
972
|
var browserClickTool = {
|
|
655
973
|
name: "browser_click",
|
|
656
974
|
description: "Click an element on the web page using a CSS selector or text content.",
|
|
657
|
-
parameters:
|
|
658
|
-
selector:
|
|
975
|
+
parameters: z9.object({
|
|
976
|
+
selector: z9.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
|
|
659
977
|
}),
|
|
660
978
|
async execute(args, ctx) {
|
|
661
979
|
const browser = await ctx.getBrowser();
|
|
@@ -666,14 +984,14 @@ var browserClickTool = {
|
|
|
666
984
|
};
|
|
667
985
|
|
|
668
986
|
// src/tools/browser/type.ts
|
|
669
|
-
import { z as
|
|
987
|
+
import { z as z10 } from "zod";
|
|
670
988
|
var browserTypeTool = {
|
|
671
989
|
name: "browser_type",
|
|
672
990
|
description: "Type text into an input field on the web page.",
|
|
673
|
-
parameters:
|
|
674
|
-
selector:
|
|
675
|
-
text:
|
|
676
|
-
clear:
|
|
991
|
+
parameters: z10.object({
|
|
992
|
+
selector: z10.string().describe("CSS selector for the input element"),
|
|
993
|
+
text: z10.string().describe("Text to type"),
|
|
994
|
+
clear: z10.boolean().default(true).describe("Whether to clear the field before typing")
|
|
677
995
|
}),
|
|
678
996
|
async execute(args, ctx) {
|
|
679
997
|
const browser = await ctx.getBrowser();
|
|
@@ -688,35 +1006,40 @@ var browserTypeTool = {
|
|
|
688
1006
|
};
|
|
689
1007
|
|
|
690
1008
|
// src/tools/browser/screenshot.ts
|
|
691
|
-
import { z as
|
|
1009
|
+
import { z as z11 } from "zod";
|
|
692
1010
|
var browserScreenshotTool = {
|
|
693
1011
|
name: "browser_screenshot",
|
|
694
|
-
description: "Take a screenshot of the current browser page.",
|
|
695
|
-
parameters:
|
|
696
|
-
fullPage:
|
|
1012
|
+
description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
|
|
1013
|
+
parameters: z11.object({
|
|
1014
|
+
fullPage: z11.boolean().default(false).describe("Whether to capture the full scrollable page")
|
|
697
1015
|
}),
|
|
698
1016
|
async execute(args, ctx) {
|
|
699
1017
|
const browser = await ctx.getBrowser();
|
|
700
1018
|
const page = await browser.getPage();
|
|
701
1019
|
const buf = await page.screenshot({
|
|
702
|
-
type: "
|
|
703
|
-
|
|
1020
|
+
type: "jpeg",
|
|
1021
|
+
quality: 70,
|
|
1022
|
+
fullPage: args.fullPage,
|
|
1023
|
+
scale: "css"
|
|
704
1024
|
});
|
|
1025
|
+
const base64 = buf.toString("base64");
|
|
1026
|
+
const id = ctx.screenshots.save(base64, "image/jpeg", "browser");
|
|
705
1027
|
return {
|
|
706
1028
|
type: "image",
|
|
707
|
-
base64
|
|
708
|
-
mimeType: "image/
|
|
1029
|
+
base64,
|
|
1030
|
+
mimeType: "image/jpeg",
|
|
1031
|
+
screenshotId: id
|
|
709
1032
|
};
|
|
710
1033
|
}
|
|
711
1034
|
};
|
|
712
1035
|
|
|
713
1036
|
// src/tools/browser/content.ts
|
|
714
|
-
import { z as
|
|
1037
|
+
import { z as z12 } from "zod";
|
|
715
1038
|
var MAX_CONTENT_LENGTH = 2e4;
|
|
716
1039
|
var browserContentTool = {
|
|
717
1040
|
name: "browser_content",
|
|
718
1041
|
description: "Get the text content of the current web page. Returns visible text, not HTML.",
|
|
719
|
-
parameters:
|
|
1042
|
+
parameters: z12.object({}),
|
|
720
1043
|
async execute(_args, ctx) {
|
|
721
1044
|
const browser = await ctx.getBrowser();
|
|
722
1045
|
const page = await browser.getPage();
|
|
@@ -737,13 +1060,13 @@ ${text}`
|
|
|
737
1060
|
};
|
|
738
1061
|
|
|
739
1062
|
// src/tools/browser/scroll.ts
|
|
740
|
-
import { z as
|
|
1063
|
+
import { z as z13 } from "zod";
|
|
741
1064
|
var browserScrollTool = {
|
|
742
1065
|
name: "browser_scroll",
|
|
743
1066
|
description: "Scroll the current web page.",
|
|
744
|
-
parameters:
|
|
745
|
-
direction:
|
|
746
|
-
amount:
|
|
1067
|
+
parameters: z13.object({
|
|
1068
|
+
direction: z13.enum(["up", "down"]).describe("Scroll direction"),
|
|
1069
|
+
amount: z13.number().positive().default(500).describe("Pixels to scroll")
|
|
747
1070
|
}),
|
|
748
1071
|
async execute(args, ctx) {
|
|
749
1072
|
const browser = await ctx.getBrowser();
|
|
@@ -755,38 +1078,22 @@ var browserScrollTool = {
|
|
|
755
1078
|
};
|
|
756
1079
|
|
|
757
1080
|
// src/tools/control/report.ts
|
|
758
|
-
import { z as
|
|
1081
|
+
import { z as z14 } from "zod";
|
|
759
1082
|
var reportTool = {
|
|
760
1083
|
name: "report",
|
|
761
|
-
description:
|
|
762
|
-
parameters:
|
|
763
|
-
status:
|
|
1084
|
+
description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
|
|
1085
|
+
parameters: z14.object({
|
|
1086
|
+
status: z14.enum(["completed", "blocked", "need_guidance"]).describe(
|
|
764
1087
|
'"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
|
|
765
1088
|
),
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
data: z13.unknown().optional().describe("Optional structured data to return")
|
|
1089
|
+
content: z14.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
|
|
1090
|
+
data: z14.unknown().optional().describe("Optional structured data to return")
|
|
769
1091
|
}),
|
|
770
1092
|
async execute(args) {
|
|
771
|
-
let screenshot;
|
|
772
|
-
if (args.include_screenshot) {
|
|
773
|
-
try {
|
|
774
|
-
const { Monitor } = await import("node-screenshots");
|
|
775
|
-
const monitors = Monitor.all();
|
|
776
|
-
const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
|
|
777
|
-
if (primary) {
|
|
778
|
-
const image = primary.captureImageSync();
|
|
779
|
-
const buf = image.toPngSync();
|
|
780
|
-
screenshot = buf.toString("base64");
|
|
781
|
-
}
|
|
782
|
-
} catch {
|
|
783
|
-
}
|
|
784
|
-
}
|
|
785
1093
|
return {
|
|
786
1094
|
type: "report",
|
|
787
1095
|
status: args.status,
|
|
788
|
-
|
|
789
|
-
screenshot,
|
|
1096
|
+
content: args.content,
|
|
790
1097
|
data: args.data
|
|
791
1098
|
};
|
|
792
1099
|
}
|
|
@@ -804,6 +1111,7 @@ function createToolRegistry() {
|
|
|
804
1111
|
registry2.register(runCommandTool);
|
|
805
1112
|
registry2.register(fileReadTool);
|
|
806
1113
|
registry2.register(fileWriteTool);
|
|
1114
|
+
registry2.register(useLocalImageTool);
|
|
807
1115
|
registry2.register(browserNavigateTool);
|
|
808
1116
|
registry2.register(browserClickTool);
|
|
809
1117
|
registry2.register(browserTypeTool);
|
|
@@ -814,21 +1122,70 @@ function createToolRegistry() {
|
|
|
814
1122
|
return registry2;
|
|
815
1123
|
}
|
|
816
1124
|
|
|
1125
|
+
// src/tools/types.ts
|
|
1126
|
+
var ScreenshotStore = class {
|
|
1127
|
+
counter = 0;
|
|
1128
|
+
store = /* @__PURE__ */ new Map();
|
|
1129
|
+
save(base64, mimeType, label) {
|
|
1130
|
+
this.counter++;
|
|
1131
|
+
const id = `img_${this.counter}`;
|
|
1132
|
+
this.store.set(id, { id, base64, mimeType, label });
|
|
1133
|
+
return id;
|
|
1134
|
+
}
|
|
1135
|
+
get(id) {
|
|
1136
|
+
return this.store.get(id);
|
|
1137
|
+
}
|
|
1138
|
+
listIds() {
|
|
1139
|
+
return [...this.store.keys()];
|
|
1140
|
+
}
|
|
1141
|
+
};
|
|
1142
|
+
/**
 * Splits report text into an ordered list of content blocks, turning every
 * `[Image:img_<n>]` placeholder into an image block when the store knows that
 * id.
 *
 * - Text between placeholders becomes `{ type: "text", text }` (empty
 *   stretches are skipped).
 * - A placeholder whose id resolves becomes
 *   `{ type: "image", id, base64, mimeType, label }`.
 * - A placeholder whose id is unknown is kept verbatim as a text block so the
 *   reader can still see what was referenced.
 *
 * @param {string | null | undefined} content - Report text; a missing value is
 *   treated as an empty report.
 * @param {{ get(id: string): ({ id: string, base64: string, mimeType: string, label?: string } | undefined) }} store
 *   Lookup for image records by id.
 * @returns {Array<object>} Blocks in document order; `[]` for empty or missing content.
 */
function parseReportContent(content, store) {
  // A report without text must not crash the caller: previously a nullish
  // `content` reached `content.length` and threw a TypeError.
  if (content == null) {
    return [];
  }
  const text = String(content);
  const blocks = [];
  let cursor = 0;

  // `matchAll` works on its own copy of the regex, so there is no shared
  // `lastIndex` state to manage.
  for (const match of text.matchAll(/\[Image:(img_\d+)\]/g)) {
    const [placeholder, id] = match;
    if (match.index > cursor) {
      blocks.push({ type: "text", text: text.slice(cursor, match.index) });
    }

    const screenshot = store.get(id);
    if (screenshot) {
      blocks.push({
        type: "image",
        id: screenshot.id,
        base64: screenshot.base64,
        mimeType: screenshot.mimeType,
        label: screenshot.label
      });
    } else {
      // Unknown id: leave the placeholder visible instead of dropping it.
      blocks.push({ type: "text", text: placeholder });
    }
    cursor = match.index + placeholder.length;
  }

  if (cursor < text.length) {
    blocks.push({ type: "text", text: text.slice(cursor) });
  }
  return blocks;
}
|
|
1171
|
+
|
|
817
1172
|
// src/mcp/session-registry.ts
|
|
818
1173
|
var SessionRegistry = class {
|
|
819
1174
|
sessions = /* @__PURE__ */ new Map();
|
|
820
1175
|
create(config) {
|
|
821
1176
|
const id = crypto.randomUUID();
|
|
822
|
-
const contextManager = new ContextManager(
|
|
1177
|
+
const contextManager = new ContextManager();
|
|
823
1178
|
const llmClient = new LLMClient(config);
|
|
824
1179
|
const browserClient = new BrowserClient(config.cdpUrl);
|
|
825
1180
|
const toolRegistry = createToolRegistry();
|
|
1181
|
+
const screenshotStore = new ScreenshotStore();
|
|
826
1182
|
const toolContext = {
|
|
827
1183
|
sessionId: id,
|
|
828
1184
|
cdpUrl: config.cdpUrl,
|
|
829
1185
|
getBrowser: () => {
|
|
830
1186
|
return browserClient.connect().then(() => browserClient);
|
|
831
|
-
}
|
|
1187
|
+
},
|
|
1188
|
+
screenshots: screenshotStore
|
|
832
1189
|
};
|
|
833
1190
|
const runner = new AgentRunner(
|
|
834
1191
|
llmClient,
|
|
@@ -848,6 +1205,7 @@ var SessionRegistry = class {
|
|
|
848
1205
|
config,
|
|
849
1206
|
runner,
|
|
850
1207
|
browserClient,
|
|
1208
|
+
screenshots: screenshotStore,
|
|
851
1209
|
timeoutHandle
|
|
852
1210
|
};
|
|
853
1211
|
this.sessions.set(id, session);
|
|
@@ -882,30 +1240,45 @@ var SessionRegistry = class {
|
|
|
882
1240
|
};
|
|
883
1241
|
|
|
884
1242
|
// src/mcp/tools.ts
|
|
885
|
-
import { z as
|
|
1243
|
+
import { z as z16 } from "zod";
|
|
1244
|
+
|
|
1245
|
+
// src/config/loader.ts
|
|
1246
|
+
import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
|
|
1247
|
+
import { join as join2 } from "path";
|
|
1248
|
+
import { homedir as homedir2 } from "os";
|
|
886
1249
|
|
|
887
1250
|
// src/config/schema.ts
|
|
888
|
-
import { z as
|
|
889
|
-
var ConfigSchema =
|
|
890
|
-
apiKey:
|
|
891
|
-
baseURL:
|
|
892
|
-
model:
|
|
893
|
-
maxSteps:
|
|
894
|
-
|
|
895
|
-
cdpUrl:
|
|
896
|
-
timeoutMs:
|
|
1251
|
+
import { z as z15 } from "zod";
|
|
1252
|
+
/**
 * Validation schema for the resolved runtime configuration.
 * Fields with a `.default(...)` may be omitted from the input.
 */
var ConfigSchema = z15.object({
  // LLM API key; must be a non-empty string.
  apiKey: z15.string().min(1, "API key is required"),

  // OpenAI-compatible base URL of the LLM endpoint.
  baseURL: z15.string().url("Must be a valid URL"),

  // Name of the model to request; must be a non-empty string.
  model: z15.string().min(1, "Model name is required"),

  // Max tool-calling steps per instruction.
  maxSteps: z15.number().int().positive().default(50),

  // Max instruction rounds per session.
  maxRounds: z15.number().int().positive().default(20),

  // Chrome CDP URL used for browser automation.
  cdpUrl: z15.string().default("http://localhost:9222"),

  // Session inactivity timeout in milliseconds (3e5 ms = 5 minutes).
  timeoutMs: z15.number().default(3e5)
});
|
|
898
1261
|
|
|
899
1262
|
// src/config/loader.ts
|
|
1263
|
+
var CONFIG_FILE = join2(homedir2(), ".windows-use.json");
|
|
1264
|
+
/**
 * Reads the optional JSON config file and returns its contents as a plain
 * object. This loader is best-effort: any problem yields `{}` so callers can
 * fall back to other configuration sources.
 *
 * Compared with a bare `JSON.parse(readFileSync(...))`:
 * - a leading UTF-8 byte-order mark (commonly added by Windows editors) is
 *   stripped, since `JSON.parse` rejects it;
 * - valid JSON that is not an object (`null`, an array, a string, ...) is
 *   ignored, so callers can safely read properties off the result;
 * - unreadable or malformed files produce a warning instead of being dropped
 *   silently. `console.warn` writes to stderr, leaving stdout untouched.
 *
 * @param {string} [configPath=CONFIG_FILE] - File to read.
 * @returns {Record<string, unknown>} Parsed settings, or `{}` when the file is
 *   missing, unreadable, or does not contain a JSON object.
 */
function loadFileConfig(configPath = CONFIG_FILE) {
  let text;
  try {
    text = readFileSync2(configPath, "utf-8");
  } catch (err) {
    // A missing file is the normal "no file config" case; stay quiet for it.
    if (err?.code !== "ENOENT") {
      console.warn(`windows-use: could not read ${configPath}: ${err?.message ?? err}`);
    }
    return {};
  }

  let parsed;
  try {
    parsed = JSON.parse(text.replace(/^\uFEFF/, ""));
  } catch (err) {
    console.warn(`windows-use: ignoring ${configPath}: invalid JSON (${err?.message ?? err})`);
    return {};
  }

  const isPlainRecord = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
  if (!isPlainRecord) {
    console.warn(`windows-use: ignoring ${configPath}: expected a JSON object at the top level`);
    return {};
  }
  return parsed;
}
|
|
900
1272
|
/**
 * Builds the effective configuration and validates it.
 *
 * For every setting the first non-nullish value wins, in this order:
 *   1. `overrides` (explicit caller-supplied values)
 *   2. the matching `WINDOWS_USE_*` environment variable
 *   3. the config file (via `loadFileConfig`)
 *   4. the built-in default
 *
 * @param {object} [overrides] - Optional explicit settings (`apiKey`,
 *   `baseURL`, `model`, `maxSteps`, `maxRounds`, `cdpUrl`, `timeoutMs`).
 * @returns The value returned by `ConfigSchema.parse`.
 * @throws Whatever `ConfigSchema.parse` raises when validation fails.
 */
function loadConfig(overrides) {
  const fromFile = loadFileConfig();
  const explicit = overrides ?? {};
  const { env } = process;

  const merged = {
    apiKey: explicit.apiKey ?? env.WINDOWS_USE_API_KEY ?? fromFile.apiKey ?? "",
    baseURL: explicit.baseURL ?? env.WINDOWS_USE_BASE_URL ?? fromFile.baseURL ?? "",
    model: explicit.model ?? env.WINDOWS_USE_MODEL ?? fromFile.model ?? "",
    // Numeric settings go through intEnv rather than reading the raw env string.
    maxSteps: explicit.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? fromFile.maxSteps ?? 50,
    maxRounds: explicit.maxRounds ?? intEnv("WINDOWS_USE_MAX_ROUNDS") ?? fromFile.maxRounds ?? 20,
    cdpUrl: explicit.cdpUrl ?? env.WINDOWS_USE_CDP_URL ?? fromFile.cdpUrl ?? "http://localhost:9222",
    timeoutMs: explicit.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? fromFile.timeoutMs ?? 3e5
  };

  return ConfigSchema.parse(merged);
}
|
|
@@ -922,12 +1295,13 @@ function registerMcpTools(server2, registry2) {
|
|
|
922
1295
|
"create_session",
|
|
923
1296
|
"Create a new automation session with a small LLM agent. Returns a session_id.",
|
|
924
1297
|
{
|
|
925
|
-
api_key:
|
|
926
|
-
base_url:
|
|
927
|
-
model:
|
|
928
|
-
cdp_url:
|
|
929
|
-
timeout_ms:
|
|
930
|
-
max_steps:
|
|
1298
|
+
api_key: z16.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
|
|
1299
|
+
base_url: z16.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
|
|
1300
|
+
model: z16.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
|
|
1301
|
+
cdp_url: z16.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
|
|
1302
|
+
timeout_ms: z16.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
|
|
1303
|
+
max_steps: z16.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
|
|
1304
|
+
max_rounds: z16.number().optional().describe("Max instruction rounds per session (default: 20)")
|
|
931
1305
|
},
|
|
932
1306
|
async (args) => {
|
|
933
1307
|
const config = loadConfig({
|
|
@@ -936,7 +1310,8 @@ function registerMcpTools(server2, registry2) {
|
|
|
936
1310
|
model: args.model,
|
|
937
1311
|
cdpUrl: args.cdp_url,
|
|
938
1312
|
timeoutMs: args.timeout_ms,
|
|
939
|
-
maxSteps: args.max_steps
|
|
1313
|
+
maxSteps: args.max_steps,
|
|
1314
|
+
maxRounds: args.max_rounds
|
|
940
1315
|
});
|
|
941
1316
|
const session = registry2.create(config);
|
|
942
1317
|
return {
|
|
@@ -951,10 +1326,10 @@ function registerMcpTools(server2, registry2) {
|
|
|
951
1326
|
);
|
|
952
1327
|
server2.tool(
|
|
953
1328
|
"send_instruction",
|
|
954
|
-
"Send a task instruction to the agent in a session. The agent executes it and returns a
|
|
1329
|
+
"Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
|
|
955
1330
|
{
|
|
956
|
-
session_id:
|
|
957
|
-
instruction:
|
|
1331
|
+
session_id: z16.string().describe("Session ID from create_session"),
|
|
1332
|
+
instruction: z16.string().describe("What you want the agent to do, in natural language")
|
|
958
1333
|
},
|
|
959
1334
|
async (args) => {
|
|
960
1335
|
const session = registry2.get(args.session_id);
|
|
@@ -974,32 +1349,37 @@ function registerMcpTools(server2, registry2) {
|
|
|
974
1349
|
registry2.touch(args.session_id);
|
|
975
1350
|
const result = await session.runner.run(args.instruction);
|
|
976
1351
|
registry2.touch(args.session_id);
|
|
977
|
-
const
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
}
|
|
1352
|
+
const mcpContent = [];
|
|
1353
|
+
mcpContent.push({
|
|
1354
|
+
type: "text",
|
|
1355
|
+
text: JSON.stringify({
|
|
1356
|
+
status: result.status,
|
|
1357
|
+
steps_used: result.stepsUsed,
|
|
1358
|
+
round: session.runner.currentRound,
|
|
1359
|
+
rounds_remaining: session.config.maxRounds - session.runner.currentRound,
|
|
1360
|
+
...result.data !== void 0 ? { data: result.data } : {}
|
|
1361
|
+
})
|
|
1362
|
+
});
|
|
1363
|
+
const blocks = parseReportContent(result.content, session.screenshots);
|
|
1364
|
+
for (const block of blocks) {
|
|
1365
|
+
if (block.type === "text") {
|
|
1366
|
+
mcpContent.push({ type: "text", text: block.text });
|
|
1367
|
+
} else {
|
|
1368
|
+
mcpContent.push({
|
|
1369
|
+
type: "image",
|
|
1370
|
+
data: block.base64,
|
|
1371
|
+
mimeType: block.mimeType
|
|
1372
|
+
});
|
|
986
1373
|
}
|
|
987
|
-
];
|
|
988
|
-
if (result.screenshot) {
|
|
989
|
-
content.push({
|
|
990
|
-
type: "image",
|
|
991
|
-
data: result.screenshot,
|
|
992
|
-
mimeType: "image/png"
|
|
993
|
-
});
|
|
994
1374
|
}
|
|
995
|
-
return { content };
|
|
1375
|
+
return { content: mcpContent };
|
|
996
1376
|
}
|
|
997
1377
|
);
|
|
998
1378
|
server2.tool(
|
|
999
1379
|
"done_session",
|
|
1000
1380
|
"Terminate a session and free all resources.",
|
|
1001
1381
|
{
|
|
1002
|
-
session_id:
|
|
1382
|
+
session_id: z16.string().describe("Session ID to terminate")
|
|
1003
1383
|
},
|
|
1004
1384
|
async (args) => {
|
|
1005
1385
|
await registry2.destroy(args.session_id);
|