windows-use 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +188 -70
- package/dist/cli.js +662 -129
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +88 -16
- package/dist/index.js +467 -88
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +504 -124
- package/dist/mcp/server.js.map +1 -1
- package/package.json +10 -1
package/dist/cli.js
CHANGED
|
@@ -15,7 +15,7 @@ var init_schema = __esm({
|
|
|
15
15
|
baseURL: z.string().url("Must be a valid URL"),
|
|
16
16
|
model: z.string().min(1, "Model name is required"),
|
|
17
17
|
maxSteps: z.number().int().positive().default(50),
|
|
18
|
-
|
|
18
|
+
maxRounds: z.number().int().positive().default(20),
|
|
19
19
|
cdpUrl: z.string().default("http://localhost:9222"),
|
|
20
20
|
timeoutMs: z.number().default(3e5)
|
|
21
21
|
});
|
|
@@ -23,15 +23,30 @@ var init_schema = __esm({
|
|
|
23
23
|
});
|
|
24
24
|
|
|
25
25
|
// src/config/loader.ts
|
|
26
|
+
import { readFileSync, existsSync } from "fs";
|
|
27
|
+
import { join } from "path";
|
|
28
|
+
import { homedir } from "os";
|
|
29
|
+
function loadFileConfig() {
|
|
30
|
+
if (!existsSync(CONFIG_FILE)) return {};
|
|
31
|
+
try {
|
|
32
|
+
return JSON.parse(readFileSync(CONFIG_FILE, "utf-8"));
|
|
33
|
+
} catch {
|
|
34
|
+
return {};
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
function getConfigPath() {
|
|
38
|
+
return CONFIG_FILE;
|
|
39
|
+
}
|
|
26
40
|
function loadConfig(overrides) {
|
|
41
|
+
const file = loadFileConfig();
|
|
27
42
|
const raw = {
|
|
28
|
-
apiKey: overrides?.apiKey ?? process.env.WINDOWS_USE_API_KEY ?? "",
|
|
29
|
-
baseURL: overrides?.baseURL ?? process.env.WINDOWS_USE_BASE_URL ?? "",
|
|
30
|
-
model: overrides?.model ?? process.env.WINDOWS_USE_MODEL ?? "",
|
|
31
|
-
maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? 50,
|
|
32
|
-
|
|
33
|
-
cdpUrl: overrides?.cdpUrl ?? process.env.WINDOWS_USE_CDP_URL ?? "http://localhost:9222",
|
|
34
|
-
timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? 3e5
|
|
43
|
+
apiKey: overrides?.apiKey ?? process.env.WINDOWS_USE_API_KEY ?? file.apiKey ?? "",
|
|
44
|
+
baseURL: overrides?.baseURL ?? process.env.WINDOWS_USE_BASE_URL ?? file.baseURL ?? "",
|
|
45
|
+
model: overrides?.model ?? process.env.WINDOWS_USE_MODEL ?? file.model ?? "",
|
|
46
|
+
maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? file.maxSteps ?? 50,
|
|
47
|
+
maxRounds: overrides?.maxRounds ?? intEnv("WINDOWS_USE_MAX_ROUNDS") ?? file.maxRounds ?? 20,
|
|
48
|
+
cdpUrl: overrides?.cdpUrl ?? process.env.WINDOWS_USE_CDP_URL ?? file.cdpUrl ?? "http://localhost:9222",
|
|
49
|
+
timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? file.timeoutMs ?? 3e5
|
|
35
50
|
};
|
|
36
51
|
return ConfigSchema.parse(raw);
|
|
37
52
|
}
|
|
@@ -41,10 +56,12 @@ function intEnv(name) {
|
|
|
41
56
|
const n = parseInt(val, 10);
|
|
42
57
|
return isNaN(n) ? void 0 : n;
|
|
43
58
|
}
|
|
59
|
+
var CONFIG_FILE;
|
|
44
60
|
var init_loader = __esm({
|
|
45
61
|
"src/config/loader.ts"() {
|
|
46
62
|
"use strict";
|
|
47
63
|
init_schema();
|
|
64
|
+
CONFIG_FILE = join(homedir(), ".windows-use.json");
|
|
48
65
|
}
|
|
49
66
|
});
|
|
50
67
|
|
|
@@ -55,23 +72,14 @@ var init_context_manager = __esm({
|
|
|
55
72
|
"use strict";
|
|
56
73
|
ContextManager = class {
|
|
57
74
|
messages = [];
|
|
58
|
-
maxMessages;
|
|
59
|
-
constructor(maxMessages) {
|
|
60
|
-
this.maxMessages = maxMessages;
|
|
61
|
-
}
|
|
62
75
|
append(message) {
|
|
63
76
|
this.messages.push(message);
|
|
64
77
|
}
|
|
65
|
-
/** Returns
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
const systemPrompt = this.messages[0]?.role === "system" ? this.messages[0] : null;
|
|
69
|
-
const nonSystem = systemPrompt ? this.messages.slice(1) : this.messages;
|
|
70
|
-
const windowSize = this.maxMessages - (systemPrompt ? 1 : 0);
|
|
71
|
-
const windowed = nonSystem.length > windowSize ? nonSystem.slice(-windowSize) : nonSystem;
|
|
72
|
-
return systemPrompt ? [systemPrompt, ...windowed] : windowed;
|
|
78
|
+
/** Returns all messages. */
|
|
79
|
+
getMessages() {
|
|
80
|
+
return [...this.messages];
|
|
73
81
|
}
|
|
74
|
-
/** Total messages stored
|
|
82
|
+
/** Total messages stored. */
|
|
75
83
|
get length() {
|
|
76
84
|
return this.messages.length;
|
|
77
85
|
}
|
|
@@ -133,7 +141,16 @@ Call \`report\` when:
|
|
|
133
141
|
- **"blocked"**: You cannot proceed (CAPTCHA, login wall, unexpected error). Explain what's blocking you.
|
|
134
142
|
- **"need_guidance"**: You need a decision or clarification. Describe what you need.
|
|
135
143
|
|
|
136
|
-
Calling \`report\` stops your execution.
|
|
144
|
+
Calling \`report\` stops your execution. The \`content\` field supports a rich document format \u2014 mix text with screenshots using \`[Image:img_X]\` markers:
|
|
145
|
+
|
|
146
|
+
\`\`\`
|
|
147
|
+
report({
|
|
148
|
+
status: "completed",
|
|
149
|
+
content: "Here is what I found:\\n[Image:img_2]\\nThe page shows the search results.\\n[Image:img_3]\\nI also checked the sidebar."
|
|
150
|
+
})
|
|
151
|
+
\`\`\`
|
|
152
|
+
|
|
153
|
+
Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
|
|
137
154
|
|
|
138
155
|
## Important
|
|
139
156
|
- Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
|
|
@@ -159,6 +176,8 @@ var init_runner = __esm({
|
|
|
159
176
|
config;
|
|
160
177
|
toolContext;
|
|
161
178
|
initialized = false;
|
|
179
|
+
onStep = null;
|
|
180
|
+
roundsUsed = 0;
|
|
162
181
|
constructor(llmClient, contextManager, toolRegistry, config, toolContext) {
|
|
163
182
|
this.llmClient = llmClient;
|
|
164
183
|
this.contextManager = contextManager;
|
|
@@ -166,7 +185,30 @@ var init_runner = __esm({
|
|
|
166
185
|
this.config = config;
|
|
167
186
|
this.toolContext = toolContext;
|
|
168
187
|
}
|
|
188
|
+
/** Register a callback to receive step-by-step progress events */
|
|
189
|
+
setOnStep(cb) {
|
|
190
|
+
this.onStep = cb;
|
|
191
|
+
}
|
|
192
|
+
emit(event) {
|
|
193
|
+
this.onStep?.(event);
|
|
194
|
+
}
|
|
195
|
+
/** How many instruction rounds have been used in this session */
|
|
196
|
+
get currentRound() {
|
|
197
|
+
return this.roundsUsed;
|
|
198
|
+
}
|
|
199
|
+
/** Whether this session has exhausted its max rounds */
|
|
200
|
+
get roundsExhausted() {
|
|
201
|
+
return this.roundsUsed >= this.config.maxRounds;
|
|
202
|
+
}
|
|
169
203
|
async run(instruction) {
|
|
204
|
+
if (this.roundsExhausted) {
|
|
205
|
+
return {
|
|
206
|
+
status: "blocked",
|
|
207
|
+
content: `Session has reached the maximum number of instruction rounds (${this.config.maxRounds}). Create a new session to continue.`,
|
|
208
|
+
stepsUsed: 0
|
|
209
|
+
};
|
|
210
|
+
}
|
|
211
|
+
this.roundsUsed++;
|
|
170
212
|
if (!this.initialized) {
|
|
171
213
|
this.contextManager.append({
|
|
172
214
|
role: "system",
|
|
@@ -182,7 +224,7 @@ var init_runner = __esm({
|
|
|
182
224
|
while (stepsUsed < this.config.maxSteps) {
|
|
183
225
|
stepsUsed++;
|
|
184
226
|
const remaining = this.config.maxSteps - stepsUsed;
|
|
185
|
-
const messages = this.contextManager.
|
|
227
|
+
const messages = this.contextManager.getMessages();
|
|
186
228
|
if (remaining <= 3 && remaining >= 0) {
|
|
187
229
|
messages.push({
|
|
188
230
|
role: "system",
|
|
@@ -195,9 +237,10 @@ var init_runner = __esm({
|
|
|
195
237
|
response = await this.llmClient.chat(messages, tools);
|
|
196
238
|
} catch (err) {
|
|
197
239
|
const msg = err instanceof Error ? err.message : String(err);
|
|
240
|
+
this.emit({ type: "error", step: stepsUsed, message: `LLM API error: ${msg}` });
|
|
198
241
|
return {
|
|
199
242
|
status: "blocked",
|
|
200
|
-
|
|
243
|
+
content: `LLM API error: ${msg}`,
|
|
201
244
|
stepsUsed
|
|
202
245
|
};
|
|
203
246
|
}
|
|
@@ -205,26 +248,31 @@ var init_runner = __esm({
|
|
|
205
248
|
if (!choice) {
|
|
206
249
|
return {
|
|
207
250
|
status: "blocked",
|
|
208
|
-
|
|
251
|
+
content: "LLM returned empty response",
|
|
209
252
|
stepsUsed
|
|
210
253
|
};
|
|
211
254
|
}
|
|
212
255
|
const message = choice.message;
|
|
256
|
+
if (message.content) {
|
|
257
|
+
this.emit({ type: "thinking", step: stepsUsed, content: message.content });
|
|
258
|
+
}
|
|
213
259
|
if (choice.finish_reason === "stop" || !message.tool_calls?.length) {
|
|
214
260
|
const text = message.content ?? "";
|
|
215
261
|
this.contextManager.append({ role: "assistant", content: text });
|
|
216
262
|
return {
|
|
217
263
|
status: "need_guidance",
|
|
218
|
-
|
|
264
|
+
content: text || "Agent stopped without calling report.",
|
|
219
265
|
stepsUsed
|
|
220
266
|
};
|
|
221
267
|
}
|
|
222
268
|
this.contextManager.append(message);
|
|
223
269
|
for (const toolCall of message.tool_calls) {
|
|
270
|
+
const toolName = toolCall.function.name;
|
|
224
271
|
let args;
|
|
225
272
|
try {
|
|
226
273
|
args = JSON.parse(toolCall.function.arguments);
|
|
227
274
|
} catch {
|
|
275
|
+
this.emit({ type: "error", step: stepsUsed, message: `Failed to parse args for ${toolName}` });
|
|
228
276
|
this.contextManager.append({
|
|
229
277
|
role: "tool",
|
|
230
278
|
tool_call_id: toolCall.id,
|
|
@@ -232,15 +280,17 @@ var init_runner = __esm({
|
|
|
232
280
|
});
|
|
233
281
|
continue;
|
|
234
282
|
}
|
|
283
|
+
this.emit({ type: "tool_call", step: stepsUsed, name: toolName, args });
|
|
235
284
|
let result;
|
|
236
285
|
try {
|
|
237
286
|
result = await this.toolRegistry.execute(
|
|
238
|
-
|
|
287
|
+
toolName,
|
|
239
288
|
args,
|
|
240
289
|
this.toolContext
|
|
241
290
|
);
|
|
242
291
|
} catch (err) {
|
|
243
292
|
const msg = err instanceof Error ? err.message : String(err);
|
|
293
|
+
this.emit({ type: "error", step: stepsUsed, message: `${toolName} failed: ${msg}` });
|
|
244
294
|
this.contextManager.append({
|
|
245
295
|
role: "tool",
|
|
246
296
|
tool_call_id: toolCall.id,
|
|
@@ -249,6 +299,7 @@ var init_runner = __esm({
|
|
|
249
299
|
continue;
|
|
250
300
|
}
|
|
251
301
|
if (result.type === "report") {
|
|
302
|
+
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `[${result.status}] report submitted` });
|
|
252
303
|
this.contextManager.append({
|
|
253
304
|
role: "tool",
|
|
254
305
|
tool_call_id: toolCall.id,
|
|
@@ -256,18 +307,18 @@ var init_runner = __esm({
|
|
|
256
307
|
});
|
|
257
308
|
return {
|
|
258
309
|
status: result.status,
|
|
259
|
-
|
|
260
|
-
screenshot: result.screenshot,
|
|
310
|
+
content: result.content,
|
|
261
311
|
data: result.data,
|
|
262
312
|
stepsUsed
|
|
263
313
|
};
|
|
264
314
|
}
|
|
265
315
|
if (result.type === "image") {
|
|
316
|
+
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
|
|
266
317
|
this.contextManager.append({
|
|
267
318
|
role: "tool",
|
|
268
319
|
tool_call_id: toolCall.id,
|
|
269
320
|
content: [
|
|
270
|
-
{ type: "text", text:
|
|
321
|
+
{ type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
|
|
271
322
|
{
|
|
272
323
|
type: "image_url",
|
|
273
324
|
image_url: {
|
|
@@ -277,6 +328,8 @@ var init_runner = __esm({
|
|
|
277
328
|
]
|
|
278
329
|
});
|
|
279
330
|
} else {
|
|
331
|
+
const preview = result.content.length > 200 ? result.content.slice(0, 200) + "..." : result.content;
|
|
332
|
+
this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: preview });
|
|
280
333
|
this.contextManager.append({
|
|
281
334
|
role: "tool",
|
|
282
335
|
tool_call_id: toolCall.id,
|
|
@@ -287,7 +340,7 @@ var init_runner = __esm({
|
|
|
287
340
|
}
|
|
288
341
|
return {
|
|
289
342
|
status: "blocked",
|
|
290
|
-
|
|
343
|
+
content: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
|
|
291
344
|
stepsUsed
|
|
292
345
|
};
|
|
293
346
|
}
|
|
@@ -296,27 +349,205 @@ var init_runner = __esm({
|
|
|
296
349
|
});
|
|
297
350
|
|
|
298
351
|
// src/tools/browser/client.ts
|
|
299
|
-
|
|
352
|
+
import { existsSync as existsSync2, mkdirSync, cpSync, readdirSync } from "fs";
|
|
353
|
+
import { spawn, execSync } from "child_process";
|
|
354
|
+
import { join as join2 } from "path";
|
|
355
|
+
import { homedir as homedir2 } from "os";
|
|
356
|
+
function findChrome() {
|
|
357
|
+
for (const p of CHROME_PATHS) {
|
|
358
|
+
if (p && existsSync2(p)) return p;
|
|
359
|
+
}
|
|
360
|
+
return null;
|
|
361
|
+
}
|
|
362
|
+
function findUserDataDir() {
|
|
363
|
+
const candidates = [
|
|
364
|
+
// Windows
|
|
365
|
+
join2(process.env.LOCALAPPDATA ?? "", "Google", "Chrome", "User Data"),
|
|
366
|
+
// macOS
|
|
367
|
+
join2(homedir2(), "Library", "Application Support", "Google", "Chrome"),
|
|
368
|
+
// Linux
|
|
369
|
+
join2(homedir2(), ".config", "google-chrome"),
|
|
370
|
+
join2(homedir2(), ".config", "chromium")
|
|
371
|
+
];
|
|
372
|
+
for (const p of candidates) {
|
|
373
|
+
if (p && existsSync2(p)) return p;
|
|
374
|
+
}
|
|
375
|
+
return null;
|
|
376
|
+
}
|
|
377
|
+
function getCdpPort(cdpUrl) {
|
|
378
|
+
try {
|
|
379
|
+
return parseInt(new URL(cdpUrl).port, 10) || 9222;
|
|
380
|
+
} catch {
|
|
381
|
+
return 9222;
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
function isChromeRunning() {
|
|
385
|
+
try {
|
|
386
|
+
if (process.platform === "win32") {
|
|
387
|
+
const out = execSync('tasklist /FI "IMAGENAME eq chrome.exe" /NH', {
|
|
388
|
+
encoding: "utf-8",
|
|
389
|
+
windowsHide: true
|
|
390
|
+
});
|
|
391
|
+
return out.includes("chrome.exe");
|
|
392
|
+
} else {
|
|
393
|
+
execSync('pgrep -x "chrome|chromium|google-chrome"', { encoding: "utf-8" });
|
|
394
|
+
return true;
|
|
395
|
+
}
|
|
396
|
+
} catch {
|
|
397
|
+
return false;
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
function syncProfile(sourceDir, targetDir) {
|
|
401
|
+
mkdirSync(targetDir, { recursive: true });
|
|
402
|
+
const entries = readdirSync(sourceDir, { withFileTypes: true });
|
|
403
|
+
for (const entry of entries) {
|
|
404
|
+
const src = join2(sourceDir, entry.name);
|
|
405
|
+
const dst = join2(targetDir, entry.name);
|
|
406
|
+
if (entry.isFile()) {
|
|
407
|
+
try {
|
|
408
|
+
cpSync(src, dst, { force: true });
|
|
409
|
+
} catch {
|
|
410
|
+
}
|
|
411
|
+
} else if (entry.isDirectory()) {
|
|
412
|
+
if (entry.name === "Default" || entry.name.startsWith("Profile ")) {
|
|
413
|
+
syncProfileDir(src, dst);
|
|
414
|
+
} else if (!SKIP_DIRS.has(entry.name)) {
|
|
415
|
+
try {
|
|
416
|
+
cpSync(src, dst, { recursive: true, force: true });
|
|
417
|
+
} catch {
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
function syncProfileDir(sourceDir, targetDir) {
|
|
424
|
+
mkdirSync(targetDir, { recursive: true });
|
|
425
|
+
let entries;
|
|
426
|
+
try {
|
|
427
|
+
entries = readdirSync(sourceDir, { withFileTypes: true });
|
|
428
|
+
} catch {
|
|
429
|
+
return;
|
|
430
|
+
}
|
|
431
|
+
for (const entry of entries) {
|
|
432
|
+
if (SKIP_DIRS.has(entry.name)) continue;
|
|
433
|
+
const src = join2(sourceDir, entry.name);
|
|
434
|
+
const dst = join2(targetDir, entry.name);
|
|
435
|
+
try {
|
|
436
|
+
if (entry.isFile()) {
|
|
437
|
+
cpSync(src, dst, { force: true });
|
|
438
|
+
} else if (entry.isDirectory()) {
|
|
439
|
+
cpSync(src, dst, { recursive: true, force: true });
|
|
440
|
+
}
|
|
441
|
+
} catch {
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
var CHROME_PATHS, SKIP_DIRS, BrowserClient;
|
|
300
446
|
var init_client = __esm({
|
|
301
447
|
"src/tools/browser/client.ts"() {
|
|
302
448
|
"use strict";
|
|
449
|
+
CHROME_PATHS = [
|
|
450
|
+
// Windows
|
|
451
|
+
"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
|
|
452
|
+
"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
|
|
453
|
+
`${process.env.LOCALAPPDATA ?? ""}\\Google\\Chrome\\Application\\chrome.exe`,
|
|
454
|
+
// macOS
|
|
455
|
+
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
|
456
|
+
// Linux
|
|
457
|
+
"/usr/bin/google-chrome",
|
|
458
|
+
"/usr/bin/google-chrome-stable",
|
|
459
|
+
"/usr/bin/chromium-browser",
|
|
460
|
+
"/usr/bin/chromium"
|
|
461
|
+
];
|
|
462
|
+
SKIP_DIRS = /* @__PURE__ */ new Set([
|
|
463
|
+
"Cache",
|
|
464
|
+
"Code Cache",
|
|
465
|
+
"GPUCache",
|
|
466
|
+
"Service Worker",
|
|
467
|
+
"CacheStorage",
|
|
468
|
+
"File System",
|
|
469
|
+
"blob_storage",
|
|
470
|
+
"IndexedDB",
|
|
471
|
+
"DawnCache",
|
|
472
|
+
"GrShaderCache",
|
|
473
|
+
"ShaderCache",
|
|
474
|
+
"optimization_guide_model_store",
|
|
475
|
+
"BrowserMetrics",
|
|
476
|
+
"Crashpad",
|
|
477
|
+
"component_crx_cache"
|
|
478
|
+
]);
|
|
303
479
|
BrowserClient = class {
|
|
304
480
|
browser = null;
|
|
305
481
|
context = null;
|
|
306
482
|
_page = null;
|
|
307
483
|
cdpUrl;
|
|
484
|
+
chromeProcess = null;
|
|
308
485
|
constructor(cdpUrl) {
|
|
309
486
|
this.cdpUrl = cdpUrl;
|
|
310
487
|
}
|
|
311
488
|
async connect() {
|
|
312
489
|
if (this.browser) return;
|
|
313
490
|
const { chromium } = await import("playwright");
|
|
314
|
-
|
|
491
|
+
try {
|
|
492
|
+
this.browser = await chromium.connectOverCDP(this.cdpUrl);
|
|
493
|
+
} catch {
|
|
494
|
+
await this.launchChrome();
|
|
495
|
+
this.browser = await chromium.connectOverCDP(this.cdpUrl);
|
|
496
|
+
}
|
|
315
497
|
const contexts = this.browser.contexts();
|
|
316
498
|
this.context = contexts[0] ?? await this.browser.newContext();
|
|
317
499
|
const pages = this.context.pages();
|
|
318
500
|
this._page = pages[0] ?? await this.context.newPage();
|
|
319
501
|
}
|
|
502
|
+
async launchChrome() {
|
|
503
|
+
const chromePath = findChrome();
|
|
504
|
+
if (!chromePath) {
|
|
505
|
+
throw new Error(
|
|
506
|
+
"Chrome not found. Please install Chrome or start it manually with: chrome --remote-debugging-port=9222"
|
|
507
|
+
);
|
|
508
|
+
}
|
|
509
|
+
const port = getCdpPort(this.cdpUrl);
|
|
510
|
+
if (isChromeRunning()) {
|
|
511
|
+
console.error("[windows-use] Chrome is running without CDP. Restarting with --remote-debugging-port...");
|
|
512
|
+
try {
|
|
513
|
+
if (process.platform === "win32") {
|
|
514
|
+
execSync("taskkill /F /IM chrome.exe /T", { windowsHide: true, stdio: "ignore" });
|
|
515
|
+
} else {
|
|
516
|
+
execSync("pkill -f chrome", { stdio: "ignore" });
|
|
517
|
+
}
|
|
518
|
+
} catch {
|
|
519
|
+
}
|
|
520
|
+
await new Promise((r) => setTimeout(r, 1500));
|
|
521
|
+
}
|
|
522
|
+
const targetDir = join2(homedir2(), ".windows-use", "chrome-profile");
|
|
523
|
+
const userDir = findUserDataDir();
|
|
524
|
+
if (userDir) {
|
|
525
|
+
console.error("[windows-use] Syncing Chrome profile (cookies, login state)...");
|
|
526
|
+
syncProfile(userDir, targetDir);
|
|
527
|
+
console.error("[windows-use] Profile synced.");
|
|
528
|
+
} else {
|
|
529
|
+
mkdirSync(targetDir, { recursive: true });
|
|
530
|
+
}
|
|
531
|
+
console.error(`[windows-use] Launching Chrome with --remote-debugging-port=${port}`);
|
|
532
|
+
this.chromeProcess = spawn(
|
|
533
|
+
chromePath,
|
|
534
|
+
[
|
|
535
|
+
`--remote-debugging-port=${port}`,
|
|
536
|
+
`--user-data-dir=${targetDir}`
|
|
537
|
+
],
|
|
538
|
+
{ detached: true, stdio: "ignore" }
|
|
539
|
+
);
|
|
540
|
+
this.chromeProcess.unref();
|
|
541
|
+
for (let i = 0; i < 30; i++) {
|
|
542
|
+
try {
|
|
543
|
+
const res = await fetch(`http://localhost:${port}/json/version`);
|
|
544
|
+
if (res.ok) return;
|
|
545
|
+
} catch {
|
|
546
|
+
}
|
|
547
|
+
await new Promise((r) => setTimeout(r, 500));
|
|
548
|
+
}
|
|
549
|
+
throw new Error("Chrome launched but CDP endpoint did not become available within 15s");
|
|
550
|
+
}
|
|
320
551
|
async getPage() {
|
|
321
552
|
await this.connect();
|
|
322
553
|
return this._page;
|
|
@@ -463,17 +694,79 @@ var init_registry = __esm({
|
|
|
463
694
|
}
|
|
464
695
|
});
|
|
465
696
|
|
|
697
|
+
// src/tools/windows/grid-overlay.ts
|
|
698
|
+
import sharp from "sharp";
|
|
699
|
+
async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
|
|
700
|
+
const gridSpacing = options.gridSpacing ?? 100;
|
|
701
|
+
const labelSpacing = options.labelSpacing ?? 200;
|
|
702
|
+
const majorSpacing = gridSpacing * 5;
|
|
703
|
+
const svgParts = [];
|
|
704
|
+
for (let x = gridSpacing; x < width; x += gridSpacing) {
|
|
705
|
+
const isMajor = x % majorSpacing === 0;
|
|
706
|
+
const opacity = isMajor ? 0.35 : 0.15;
|
|
707
|
+
const sw = isMajor ? 1.5 : 0.5;
|
|
708
|
+
svgParts.push(
|
|
709
|
+
`<line x1="${x}" y1="0" x2="${x}" y2="${height}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
|
|
710
|
+
);
|
|
711
|
+
}
|
|
712
|
+
for (let y = gridSpacing; y < height; y += gridSpacing) {
|
|
713
|
+
const isMajor = y % majorSpacing === 0;
|
|
714
|
+
const opacity = isMajor ? 0.35 : 0.15;
|
|
715
|
+
const sw = isMajor ? 1.5 : 0.5;
|
|
716
|
+
svgParts.push(
|
|
717
|
+
`<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
|
|
718
|
+
);
|
|
719
|
+
}
|
|
720
|
+
for (let x = labelSpacing; x < width; x += labelSpacing) {
|
|
721
|
+
const text = String(x);
|
|
722
|
+
const tw = text.length * 7.5 + 6;
|
|
723
|
+
svgParts.push(
|
|
724
|
+
`<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
725
|
+
`<text x="${x}" y="14" text-anchor="middle" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
|
|
726
|
+
);
|
|
727
|
+
}
|
|
728
|
+
for (let y = labelSpacing; y < height; y += labelSpacing) {
|
|
729
|
+
const text = String(y);
|
|
730
|
+
const tw = text.length * 7.5 + 6;
|
|
731
|
+
svgParts.push(
|
|
732
|
+
`<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
733
|
+
`<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
|
|
734
|
+
);
|
|
735
|
+
}
|
|
736
|
+
svgParts.push(
|
|
737
|
+
`<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
738
|
+
`<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
|
|
739
|
+
);
|
|
740
|
+
const dimText = `${width}x${height}`;
|
|
741
|
+
const dimTw = dimText.length * 7.5 + 6;
|
|
742
|
+
svgParts.push(
|
|
743
|
+
`<rect x="${width - dimTw - 2}" y="${height - 18}" width="${dimTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
|
|
744
|
+
`<text x="${width - dimTw / 2 - 2}" y="${height - 6}" text-anchor="middle" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${dimText}</text>`
|
|
745
|
+
);
|
|
746
|
+
const svg = Buffer.from(
|
|
747
|
+
`<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
|
|
748
|
+
);
|
|
749
|
+
return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
|
|
750
|
+
}
|
|
751
|
+
var init_grid_overlay = __esm({
|
|
752
|
+
"src/tools/windows/grid-overlay.ts"() {
|
|
753
|
+
"use strict";
|
|
754
|
+
}
|
|
755
|
+
});
|
|
756
|
+
|
|
466
757
|
// src/tools/windows/screenshot.ts
|
|
467
758
|
import { z as z2 } from "zod";
|
|
759
|
+
import sharp2 from "sharp";
|
|
468
760
|
var screenshotTool;
|
|
469
761
|
var init_screenshot = __esm({
|
|
470
762
|
"src/tools/windows/screenshot.ts"() {
|
|
471
763
|
"use strict";
|
|
764
|
+
init_grid_overlay();
|
|
472
765
|
screenshotTool = {
|
|
473
766
|
name: "screenshot",
|
|
474
|
-
description: "Capture the full screen
|
|
767
|
+
description: "Capture the full screen with a coordinate grid overlay. The grid shows pixel coordinates that match mouse_click/mouse_move coordinates. Returns a screenshot ID.",
|
|
475
768
|
parameters: z2.object({}),
|
|
476
|
-
async execute() {
|
|
769
|
+
async execute(_args, ctx) {
|
|
477
770
|
const { Monitor } = await import("node-screenshots");
|
|
478
771
|
const monitors = Monitor.all();
|
|
479
772
|
const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
|
|
@@ -481,11 +774,24 @@ var init_screenshot = __esm({
|
|
|
481
774
|
return { type: "text", content: "Error: No monitor found" };
|
|
482
775
|
}
|
|
483
776
|
const image = primary.captureImageSync();
|
|
484
|
-
const
|
|
777
|
+
const physW = image.width;
|
|
778
|
+
const physH = image.height;
|
|
779
|
+
const scaleFactor = primary.scaleFactor ?? 1;
|
|
780
|
+
const logicalW = Math.round(physW / scaleFactor);
|
|
781
|
+
const logicalH = Math.round(physH / scaleFactor);
|
|
782
|
+
const raw = image.toRawSync();
|
|
783
|
+
const resized = await sharp2(raw, {
|
|
784
|
+
raw: { width: physW, height: physH, channels: 4 }
|
|
785
|
+
}).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
|
|
786
|
+
const cleanBase64 = resized.toString("base64");
|
|
787
|
+
const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
|
|
788
|
+
const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
|
|
789
|
+
const gridBase64 = gridImage.toString("base64");
|
|
485
790
|
return {
|
|
486
791
|
type: "image",
|
|
487
|
-
base64:
|
|
488
|
-
mimeType: "image/
|
|
792
|
+
base64: gridBase64,
|
|
793
|
+
mimeType: "image/jpeg",
|
|
794
|
+
screenshotId: id
|
|
489
795
|
};
|
|
490
796
|
}
|
|
491
797
|
};
|
|
@@ -752,8 +1058,47 @@ var init_write = __esm({
|
|
|
752
1058
|
}
|
|
753
1059
|
});
|
|
754
1060
|
|
|
755
|
-
// src/tools/
|
|
1061
|
+
// src/tools/file/image.ts
|
|
756
1062
|
import { z as z8 } from "zod";
|
|
1063
|
+
import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
|
|
1064
|
+
import { extname } from "path";
|
|
1065
|
+
var IMAGE_EXTS, useLocalImageTool;
|
|
1066
|
+
var init_image = __esm({
|
|
1067
|
+
"src/tools/file/image.ts"() {
|
|
1068
|
+
"use strict";
|
|
1069
|
+
IMAGE_EXTS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".bmp", ".webp"]);
|
|
1070
|
+
useLocalImageTool = {
|
|
1071
|
+
name: "use_local_image",
|
|
1072
|
+
description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
|
|
1073
|
+
parameters: z8.object({
|
|
1074
|
+
path: z8.string().describe("Absolute path to the image file"),
|
|
1075
|
+
label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
|
|
1076
|
+
}),
|
|
1077
|
+
async execute(args, ctx) {
|
|
1078
|
+
if (!existsSync3(args.path)) {
|
|
1079
|
+
return { type: "text", content: `Error: File not found: ${args.path}` };
|
|
1080
|
+
}
|
|
1081
|
+
const ext = extname(args.path).toLowerCase();
|
|
1082
|
+
if (!IMAGE_EXTS.has(ext)) {
|
|
1083
|
+
return { type: "text", content: `Error: Not a supported image format (${ext}). Supported: ${[...IMAGE_EXTS].join(", ")}` };
|
|
1084
|
+
}
|
|
1085
|
+
const buf = readFileSync2(args.path);
|
|
1086
|
+
const mimeType = ext === ".png" ? "image/png" : "image/jpeg";
|
|
1087
|
+
const base64 = buf.toString("base64");
|
|
1088
|
+
const id = ctx.screenshots.save(base64, mimeType, args.label);
|
|
1089
|
+
return {
|
|
1090
|
+
type: "image",
|
|
1091
|
+
base64,
|
|
1092
|
+
mimeType,
|
|
1093
|
+
screenshotId: id
|
|
1094
|
+
};
|
|
1095
|
+
}
|
|
1096
|
+
};
|
|
1097
|
+
}
|
|
1098
|
+
});
|
|
1099
|
+
|
|
1100
|
+
// src/tools/browser/navigate.ts
|
|
1101
|
+
import { z as z9 } from "zod";
|
|
757
1102
|
var browserNavigateTool;
|
|
758
1103
|
var init_navigate = __esm({
|
|
759
1104
|
"src/tools/browser/navigate.ts"() {
|
|
@@ -761,8 +1106,8 @@ var init_navigate = __esm({
|
|
|
761
1106
|
browserNavigateTool = {
|
|
762
1107
|
name: "browser_navigate",
|
|
763
1108
|
description: "Navigate the browser to a URL.",
|
|
764
|
-
parameters:
|
|
765
|
-
url:
|
|
1109
|
+
parameters: z9.object({
|
|
1110
|
+
url: z9.string().describe("The URL to navigate to")
|
|
766
1111
|
}),
|
|
767
1112
|
async execute(args, ctx) {
|
|
768
1113
|
const browser = await ctx.getBrowser();
|
|
@@ -777,7 +1122,7 @@ Page title: ${title}` };
|
|
|
777
1122
|
});
|
|
778
1123
|
|
|
779
1124
|
// src/tools/browser/click.ts
|
|
780
|
-
import { z as
|
|
1125
|
+
import { z as z10 } from "zod";
|
|
781
1126
|
var browserClickTool;
|
|
782
1127
|
var init_click = __esm({
|
|
783
1128
|
"src/tools/browser/click.ts"() {
|
|
@@ -785,8 +1130,8 @@ var init_click = __esm({
|
|
|
785
1130
|
browserClickTool = {
|
|
786
1131
|
name: "browser_click",
|
|
787
1132
|
description: "Click an element on the web page using a CSS selector or text content.",
|
|
788
|
-
parameters:
|
|
789
|
-
selector:
|
|
1133
|
+
parameters: z10.object({
|
|
1134
|
+
selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
|
|
790
1135
|
}),
|
|
791
1136
|
async execute(args, ctx) {
|
|
792
1137
|
const browser = await ctx.getBrowser();
|
|
@@ -799,7 +1144,7 @@ var init_click = __esm({
|
|
|
799
1144
|
});
|
|
800
1145
|
|
|
801
1146
|
// src/tools/browser/type.ts
|
|
802
|
-
import { z as
|
|
1147
|
+
import { z as z11 } from "zod";
|
|
803
1148
|
var browserTypeTool;
|
|
804
1149
|
var init_type = __esm({
|
|
805
1150
|
"src/tools/browser/type.ts"() {
|
|
@@ -807,10 +1152,10 @@ var init_type = __esm({
|
|
|
807
1152
|
browserTypeTool = {
|
|
808
1153
|
name: "browser_type",
|
|
809
1154
|
description: "Type text into an input field on the web page.",
|
|
810
|
-
parameters:
|
|
811
|
-
selector:
|
|
812
|
-
text:
|
|
813
|
-
clear:
|
|
1155
|
+
parameters: z11.object({
|
|
1156
|
+
selector: z11.string().describe("CSS selector for the input element"),
|
|
1157
|
+
text: z11.string().describe("Text to type"),
|
|
1158
|
+
clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
|
|
814
1159
|
}),
|
|
815
1160
|
async execute(args, ctx) {
|
|
816
1161
|
const browser = await ctx.getBrowser();
|
|
@@ -827,28 +1172,33 @@ var init_type = __esm({
|
|
|
827
1172
|
});
|
|
828
1173
|
|
|
829
1174
|
// src/tools/browser/screenshot.ts
|
|
830
|
-
import { z as
|
|
1175
|
+
import { z as z12 } from "zod";
|
|
831
1176
|
var browserScreenshotTool;
|
|
832
1177
|
var init_screenshot2 = __esm({
|
|
833
1178
|
"src/tools/browser/screenshot.ts"() {
|
|
834
1179
|
"use strict";
|
|
835
1180
|
browserScreenshotTool = {
|
|
836
1181
|
name: "browser_screenshot",
|
|
837
|
-
description: "Take a screenshot of the current browser page.",
|
|
838
|
-
parameters:
|
|
839
|
-
fullPage:
|
|
1182
|
+
description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
|
|
1183
|
+
parameters: z12.object({
|
|
1184
|
+
fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
|
|
840
1185
|
}),
|
|
841
1186
|
async execute(args, ctx) {
|
|
842
1187
|
const browser = await ctx.getBrowser();
|
|
843
1188
|
const page = await browser.getPage();
|
|
844
1189
|
const buf = await page.screenshot({
|
|
845
|
-
type: "
|
|
846
|
-
|
|
1190
|
+
type: "jpeg",
|
|
1191
|
+
quality: 70,
|
|
1192
|
+
fullPage: args.fullPage,
|
|
1193
|
+
scale: "css"
|
|
847
1194
|
});
|
|
1195
|
+
const base64 = buf.toString("base64");
|
|
1196
|
+
const id = ctx.screenshots.save(base64, "image/jpeg", "browser");
|
|
848
1197
|
return {
|
|
849
1198
|
type: "image",
|
|
850
|
-
base64
|
|
851
|
-
mimeType: "image/
|
|
1199
|
+
base64,
|
|
1200
|
+
mimeType: "image/jpeg",
|
|
1201
|
+
screenshotId: id
|
|
852
1202
|
};
|
|
853
1203
|
}
|
|
854
1204
|
};
|
|
@@ -856,7 +1206,7 @@ var init_screenshot2 = __esm({
|
|
|
856
1206
|
});
|
|
857
1207
|
|
|
858
1208
|
// src/tools/browser/content.ts
|
|
859
|
-
import { z as
|
|
1209
|
+
import { z as z13 } from "zod";
|
|
860
1210
|
var MAX_CONTENT_LENGTH, browserContentTool;
|
|
861
1211
|
var init_content = __esm({
|
|
862
1212
|
"src/tools/browser/content.ts"() {
|
|
@@ -865,7 +1215,7 @@ var init_content = __esm({
|
|
|
865
1215
|
browserContentTool = {
|
|
866
1216
|
name: "browser_content",
|
|
867
1217
|
description: "Get the text content of the current web page. Returns visible text, not HTML.",
|
|
868
|
-
parameters:
|
|
1218
|
+
parameters: z13.object({}),
|
|
869
1219
|
async execute(_args, ctx) {
|
|
870
1220
|
const browser = await ctx.getBrowser();
|
|
871
1221
|
const page = await browser.getPage();
|
|
@@ -888,7 +1238,7 @@ ${text}`
|
|
|
888
1238
|
});
|
|
889
1239
|
|
|
890
1240
|
// src/tools/browser/scroll.ts
|
|
891
|
-
import { z as
|
|
1241
|
+
import { z as z14 } from "zod";
|
|
892
1242
|
var browserScrollTool;
|
|
893
1243
|
var init_scroll = __esm({
|
|
894
1244
|
"src/tools/browser/scroll.ts"() {
|
|
@@ -896,9 +1246,9 @@ var init_scroll = __esm({
|
|
|
896
1246
|
browserScrollTool = {
|
|
897
1247
|
name: "browser_scroll",
|
|
898
1248
|
description: "Scroll the current web page.",
|
|
899
|
-
parameters:
|
|
900
|
-
direction:
|
|
901
|
-
amount:
|
|
1249
|
+
parameters: z14.object({
|
|
1250
|
+
direction: z14.enum(["up", "down"]).describe("Scroll direction"),
|
|
1251
|
+
amount: z14.number().positive().default(500).describe("Pixels to scroll")
|
|
902
1252
|
}),
|
|
903
1253
|
async execute(args, ctx) {
|
|
904
1254
|
const browser = await ctx.getBrowser();
|
|
@@ -912,42 +1262,26 @@ var init_scroll = __esm({
|
|
|
912
1262
|
});
|
|
913
1263
|
|
|
914
1264
|
// src/tools/control/report.ts
|
|
915
|
-
import { z as
|
|
1265
|
+
import { z as z15 } from "zod";
|
|
916
1266
|
var reportTool;
|
|
917
1267
|
var init_report = __esm({
|
|
918
1268
|
"src/tools/control/report.ts"() {
|
|
919
1269
|
"use strict";
|
|
920
1270
|
reportTool = {
|
|
921
1271
|
name: "report",
|
|
922
|
-
description:
|
|
923
|
-
parameters:
|
|
924
|
-
status:
|
|
1272
|
+
description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
|
|
1273
|
+
parameters: z15.object({
|
|
1274
|
+
status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
|
|
925
1275
|
'"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
|
|
926
1276
|
),
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
data: z14.unknown().optional().describe("Optional structured data to return")
|
|
1277
|
+
content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
|
|
1278
|
+
data: z15.unknown().optional().describe("Optional structured data to return")
|
|
930
1279
|
}),
|
|
931
1280
|
async execute(args) {
|
|
932
|
-
let screenshot;
|
|
933
|
-
if (args.include_screenshot) {
|
|
934
|
-
try {
|
|
935
|
-
const { Monitor } = await import("node-screenshots");
|
|
936
|
-
const monitors = Monitor.all();
|
|
937
|
-
const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
|
|
938
|
-
if (primary) {
|
|
939
|
-
const image = primary.captureImageSync();
|
|
940
|
-
const buf = image.toPngSync();
|
|
941
|
-
screenshot = buf.toString("base64");
|
|
942
|
-
}
|
|
943
|
-
} catch {
|
|
944
|
-
}
|
|
945
|
-
}
|
|
946
1281
|
return {
|
|
947
1282
|
type: "report",
|
|
948
1283
|
status: args.status,
|
|
949
|
-
|
|
950
|
-
screenshot,
|
|
1284
|
+
content: args.content,
|
|
951
1285
|
data: args.data
|
|
952
1286
|
};
|
|
953
1287
|
}
|
|
@@ -967,6 +1301,7 @@ function createToolRegistry() {
|
|
|
967
1301
|
registry2.register(runCommandTool);
|
|
968
1302
|
registry2.register(fileReadTool);
|
|
969
1303
|
registry2.register(fileWriteTool);
|
|
1304
|
+
registry2.register(useLocalImageTool);
|
|
970
1305
|
registry2.register(browserNavigateTool);
|
|
971
1306
|
registry2.register(browserClickTool);
|
|
972
1307
|
registry2.register(browserTypeTool);
|
|
@@ -986,6 +1321,7 @@ var init_tools = __esm({
|
|
|
986
1321
|
init_command();
|
|
987
1322
|
init_read();
|
|
988
1323
|
init_write();
|
|
1324
|
+
init_image();
|
|
989
1325
|
init_navigate();
|
|
990
1326
|
init_click();
|
|
991
1327
|
init_type();
|
|
@@ -996,6 +1332,59 @@ var init_tools = __esm({
|
|
|
996
1332
|
}
|
|
997
1333
|
});
|
|
998
1334
|
|
|
1335
|
+
// src/tools/types.ts
|
|
1336
|
+
function parseReportContent(content, store) {
|
|
1337
|
+
const blocks = [];
|
|
1338
|
+
const regex = /\[Image:(img_\d+)\]/g;
|
|
1339
|
+
let lastIndex = 0;
|
|
1340
|
+
let match;
|
|
1341
|
+
while ((match = regex.exec(content)) !== null) {
|
|
1342
|
+
if (match.index > lastIndex) {
|
|
1343
|
+
blocks.push({ type: "text", text: content.slice(lastIndex, match.index) });
|
|
1344
|
+
}
|
|
1345
|
+
const id = match[1];
|
|
1346
|
+
const screenshot = store.get(id);
|
|
1347
|
+
if (screenshot) {
|
|
1348
|
+
blocks.push({
|
|
1349
|
+
type: "image",
|
|
1350
|
+
id: screenshot.id,
|
|
1351
|
+
base64: screenshot.base64,
|
|
1352
|
+
mimeType: screenshot.mimeType,
|
|
1353
|
+
label: screenshot.label
|
|
1354
|
+
});
|
|
1355
|
+
} else {
|
|
1356
|
+
blocks.push({ type: "text", text: match[0] });
|
|
1357
|
+
}
|
|
1358
|
+
lastIndex = regex.lastIndex;
|
|
1359
|
+
}
|
|
1360
|
+
if (lastIndex < content.length) {
|
|
1361
|
+
blocks.push({ type: "text", text: content.slice(lastIndex) });
|
|
1362
|
+
}
|
|
1363
|
+
return blocks;
|
|
1364
|
+
}
|
|
1365
|
+
var ScreenshotStore;
|
|
1366
|
+
var init_types = __esm({
|
|
1367
|
+
"src/tools/types.ts"() {
|
|
1368
|
+
"use strict";
|
|
1369
|
+
ScreenshotStore = class {
|
|
1370
|
+
counter = 0;
|
|
1371
|
+
store = /* @__PURE__ */ new Map();
|
|
1372
|
+
save(base64, mimeType, label) {
|
|
1373
|
+
this.counter++;
|
|
1374
|
+
const id = `img_${this.counter}`;
|
|
1375
|
+
this.store.set(id, { id, base64, mimeType, label });
|
|
1376
|
+
return id;
|
|
1377
|
+
}
|
|
1378
|
+
get(id) {
|
|
1379
|
+
return this.store.get(id);
|
|
1380
|
+
}
|
|
1381
|
+
listIds() {
|
|
1382
|
+
return [...this.store.keys()];
|
|
1383
|
+
}
|
|
1384
|
+
};
|
|
1385
|
+
}
|
|
1386
|
+
});
|
|
1387
|
+
|
|
999
1388
|
// src/mcp/session-registry.ts
|
|
1000
1389
|
import crypto from "crypto";
|
|
1001
1390
|
var SessionRegistry;
|
|
@@ -1007,20 +1396,23 @@ var init_session_registry = __esm({
|
|
|
1007
1396
|
init_runner();
|
|
1008
1397
|
init_client();
|
|
1009
1398
|
init_tools();
|
|
1399
|
+
init_types();
|
|
1010
1400
|
SessionRegistry = class {
|
|
1011
1401
|
sessions = /* @__PURE__ */ new Map();
|
|
1012
1402
|
create(config) {
|
|
1013
1403
|
const id = crypto.randomUUID();
|
|
1014
|
-
const contextManager = new ContextManager(
|
|
1404
|
+
const contextManager = new ContextManager();
|
|
1015
1405
|
const llmClient = new LLMClient(config);
|
|
1016
1406
|
const browserClient = new BrowserClient(config.cdpUrl);
|
|
1017
1407
|
const toolRegistry = createToolRegistry();
|
|
1408
|
+
const screenshotStore = new ScreenshotStore();
|
|
1018
1409
|
const toolContext = {
|
|
1019
1410
|
sessionId: id,
|
|
1020
1411
|
cdpUrl: config.cdpUrl,
|
|
1021
1412
|
getBrowser: () => {
|
|
1022
1413
|
return browserClient.connect().then(() => browserClient);
|
|
1023
|
-
}
|
|
1414
|
+
},
|
|
1415
|
+
screenshots: screenshotStore
|
|
1024
1416
|
};
|
|
1025
1417
|
const runner = new AgentRunner(
|
|
1026
1418
|
llmClient,
|
|
@@ -1040,6 +1432,7 @@ var init_session_registry = __esm({
|
|
|
1040
1432
|
config,
|
|
1041
1433
|
runner,
|
|
1042
1434
|
browserClient,
|
|
1435
|
+
screenshots: screenshotStore,
|
|
1043
1436
|
timeoutHandle
|
|
1044
1437
|
};
|
|
1045
1438
|
this.sessions.set(id, session);
|
|
@@ -1076,18 +1469,19 @@ var init_session_registry = __esm({
|
|
|
1076
1469
|
});
|
|
1077
1470
|
|
|
1078
1471
|
// src/mcp/tools.ts
|
|
1079
|
-
import { z as
|
|
1472
|
+
import { z as z16 } from "zod";
|
|
1080
1473
|
function registerMcpTools(server2, registry2) {
|
|
1081
1474
|
server2.tool(
|
|
1082
1475
|
"create_session",
|
|
1083
1476
|
"Create a new automation session with a small LLM agent. Returns a session_id.",
|
|
1084
1477
|
{
|
|
1085
|
-
api_key:
|
|
1086
|
-
base_url:
|
|
1087
|
-
model:
|
|
1088
|
-
cdp_url:
|
|
1089
|
-
timeout_ms:
|
|
1090
|
-
max_steps:
|
|
1478
|
+
api_key: z16.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
|
|
1479
|
+
base_url: z16.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
|
|
1480
|
+
model: z16.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
|
|
1481
|
+
cdp_url: z16.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
|
|
1482
|
+
timeout_ms: z16.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
|
|
1483
|
+
max_steps: z16.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
|
|
1484
|
+
max_rounds: z16.number().optional().describe("Max instruction rounds per session (default: 20)")
|
|
1091
1485
|
},
|
|
1092
1486
|
async (args) => {
|
|
1093
1487
|
const config = loadConfig({
|
|
@@ -1096,7 +1490,8 @@ function registerMcpTools(server2, registry2) {
|
|
|
1096
1490
|
model: args.model,
|
|
1097
1491
|
cdpUrl: args.cdp_url,
|
|
1098
1492
|
timeoutMs: args.timeout_ms,
|
|
1099
|
-
maxSteps: args.max_steps
|
|
1493
|
+
maxSteps: args.max_steps,
|
|
1494
|
+
maxRounds: args.max_rounds
|
|
1100
1495
|
});
|
|
1101
1496
|
const session = registry2.create(config);
|
|
1102
1497
|
return {
|
|
@@ -1111,10 +1506,10 @@ function registerMcpTools(server2, registry2) {
|
|
|
1111
1506
|
);
|
|
1112
1507
|
server2.tool(
|
|
1113
1508
|
"send_instruction",
|
|
1114
|
-
"Send a task instruction to the agent in a session. The agent executes it and returns a
|
|
1509
|
+
"Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
|
|
1115
1510
|
{
|
|
1116
|
-
session_id:
|
|
1117
|
-
instruction:
|
|
1511
|
+
session_id: z16.string().describe("Session ID from create_session"),
|
|
1512
|
+
instruction: z16.string().describe("What you want the agent to do, in natural language")
|
|
1118
1513
|
},
|
|
1119
1514
|
async (args) => {
|
|
1120
1515
|
const session = registry2.get(args.session_id);
|
|
@@ -1134,32 +1529,37 @@ function registerMcpTools(server2, registry2) {
|
|
|
1134
1529
|
registry2.touch(args.session_id);
|
|
1135
1530
|
const result = await session.runner.run(args.instruction);
|
|
1136
1531
|
registry2.touch(args.session_id);
|
|
1137
|
-
const
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
}
|
|
1532
|
+
const mcpContent = [];
|
|
1533
|
+
mcpContent.push({
|
|
1534
|
+
type: "text",
|
|
1535
|
+
text: JSON.stringify({
|
|
1536
|
+
status: result.status,
|
|
1537
|
+
steps_used: result.stepsUsed,
|
|
1538
|
+
round: session.runner.currentRound,
|
|
1539
|
+
rounds_remaining: session.config.maxRounds - session.runner.currentRound,
|
|
1540
|
+
...result.data !== void 0 ? { data: result.data } : {}
|
|
1541
|
+
})
|
|
1542
|
+
});
|
|
1543
|
+
const blocks = parseReportContent(result.content, session.screenshots);
|
|
1544
|
+
for (const block of blocks) {
|
|
1545
|
+
if (block.type === "text") {
|
|
1546
|
+
mcpContent.push({ type: "text", text: block.text });
|
|
1547
|
+
} else {
|
|
1548
|
+
mcpContent.push({
|
|
1549
|
+
type: "image",
|
|
1550
|
+
data: block.base64,
|
|
1551
|
+
mimeType: block.mimeType
|
|
1552
|
+
});
|
|
1146
1553
|
}
|
|
1147
|
-
];
|
|
1148
|
-
if (result.screenshot) {
|
|
1149
|
-
content.push({
|
|
1150
|
-
type: "image",
|
|
1151
|
-
data: result.screenshot,
|
|
1152
|
-
mimeType: "image/png"
|
|
1153
|
-
});
|
|
1154
1554
|
}
|
|
1155
|
-
return { content };
|
|
1555
|
+
return { content: mcpContent };
|
|
1156
1556
|
}
|
|
1157
1557
|
);
|
|
1158
1558
|
server2.tool(
|
|
1159
1559
|
"done_session",
|
|
1160
1560
|
"Terminate a session and free all resources.",
|
|
1161
1561
|
{
|
|
1162
|
-
session_id:
|
|
1562
|
+
session_id: z16.string().describe("Session ID to terminate")
|
|
1163
1563
|
},
|
|
1164
1564
|
async (args) => {
|
|
1165
1565
|
await registry2.destroy(args.session_id);
|
|
@@ -1178,6 +1578,7 @@ var init_tools2 = __esm({
|
|
|
1178
1578
|
"src/mcp/tools.ts"() {
|
|
1179
1579
|
"use strict";
|
|
1180
1580
|
init_loader();
|
|
1581
|
+
init_types();
|
|
1181
1582
|
}
|
|
1182
1583
|
});
|
|
1183
1584
|
|
|
@@ -1212,9 +1613,67 @@ var init_server = __esm({
|
|
|
1212
1613
|
// src/cli.ts
|
|
1213
1614
|
init_loader();
|
|
1214
1615
|
init_session_registry();
|
|
1616
|
+
init_types();
|
|
1215
1617
|
import { program } from "commander";
|
|
1216
|
-
|
|
1217
|
-
|
|
1618
|
+
import { createInterface } from "readline";
|
|
1619
|
+
import { createServer } from "http";
|
|
1620
|
+
import { mkdirSync as mkdirSync2, writeFileSync } from "fs";
|
|
1621
|
+
import { join as join3 } from "path";
|
|
1622
|
+
import { tmpdir } from "os";
|
|
1623
|
+
function startScreenshotServer(screenshotDir) {
|
|
1624
|
+
let counter = 0;
|
|
1625
|
+
const files = /* @__PURE__ */ new Map();
|
|
1626
|
+
return new Promise((resolve) => {
|
|
1627
|
+
const server2 = createServer((req, res) => {
|
|
1628
|
+
const name = req.url?.slice(1) ?? "";
|
|
1629
|
+
const buf = files.get(name);
|
|
1630
|
+
if (buf) {
|
|
1631
|
+
const ct = name.endsWith(".jpg") ? "image/jpeg" : "image/png";
|
|
1632
|
+
res.writeHead(200, { "Content-Type": ct });
|
|
1633
|
+
res.end(buf);
|
|
1634
|
+
} else {
|
|
1635
|
+
res.writeHead(200, { "Content-Type": "text/html" });
|
|
1636
|
+
const links = [...files.keys()].map((f) => `<a href="/${f}"><img src="/${f}" style="max-width:400px;margin:8px"></a>`).join("\n");
|
|
1637
|
+
res.end(`<html><body style="background:#1a1a1a;display:flex;flex-wrap:wrap">${links}</body></html>`);
|
|
1638
|
+
}
|
|
1639
|
+
});
|
|
1640
|
+
server2.listen(0, "127.0.0.1", () => {
|
|
1641
|
+
const addr = server2.address();
|
|
1642
|
+
const port = typeof addr === "object" && addr ? addr.port : 0;
|
|
1643
|
+
const save = (base64) => {
|
|
1644
|
+
counter++;
|
|
1645
|
+
const name = `screenshot-${counter}.jpg`;
|
|
1646
|
+
const buf = Buffer.from(base64, "base64");
|
|
1647
|
+
files.set(name, buf);
|
|
1648
|
+
const filePath = join3(screenshotDir, name);
|
|
1649
|
+
writeFileSync(filePath, buf);
|
|
1650
|
+
return `http://127.0.0.1:${port}/${name}`;
|
|
1651
|
+
};
|
|
1652
|
+
resolve({ port, save });
|
|
1653
|
+
});
|
|
1654
|
+
});
|
|
1655
|
+
}
|
|
1656
|
+
program.name("windows-use").description("Run Windows/browser automation tasks using a small LLM agent").version("0.2.0");
|
|
1657
|
+
program.command("init").description("Interactive setup \u2014 save config to ~/.windows-use.json").action(async () => {
|
|
1658
|
+
const rl = createInterface({ input: process.stdin, output: process.stdout });
|
|
1659
|
+
const ask = (q) => new Promise((resolve) => rl.question(q, (a) => resolve(a.trim())));
|
|
1660
|
+
console.log("\n\u{1F527} windows-use setup\n");
|
|
1661
|
+
const baseURL = await ask("Base URL (OpenAI-compatible endpoint): ");
|
|
1662
|
+
const apiKey = await ask("API Key: ");
|
|
1663
|
+
const model = await ask("Model name (e.g. qwen3.5-flash): ");
|
|
1664
|
+
rl.close();
|
|
1665
|
+
const config = {};
|
|
1666
|
+
if (baseURL) config.baseURL = baseURL;
|
|
1667
|
+
if (apiKey) config.apiKey = apiKey;
|
|
1668
|
+
if (model) config.model = model;
|
|
1669
|
+
const configPath = getConfigPath();
|
|
1670
|
+
writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n", "utf-8");
|
|
1671
|
+
console.log(`
|
|
1672
|
+
\u2705 Config saved to ${configPath}`);
|
|
1673
|
+
console.log('You can now run: windows-use "your task here"\n');
|
|
1674
|
+
});
|
|
1675
|
+
program.argument("[instruction]", "The task to perform").option("--api-key <key>", "LLM API key").option("--base-url <url>", "OpenAI-compatible base URL").option("--model <name>", "Model name").option("--cdp-url <url>", "Chrome CDP URL (default: http://localhost:9222)").option("--max-steps <n>", "Max tool-calling steps per instruction", parseInt).option("--max-rounds <n>", "Max instruction rounds per session", parseInt).option("--mcp", "Start as MCP server instead of running a task").action(async (instruction, opts) => {
|
|
1676
|
+
if (opts.mcp) {
|
|
1218
1677
|
await init_server().then(() => server_exports);
|
|
1219
1678
|
return;
|
|
1220
1679
|
}
|
|
@@ -1225,28 +1684,102 @@ program.name("windows-use").description("Run Windows/browser automation tasks us
|
|
|
1225
1684
|
baseURL: opts.baseUrl,
|
|
1226
1685
|
model: opts.model,
|
|
1227
1686
|
cdpUrl: opts.cdpUrl,
|
|
1228
|
-
maxSteps: opts.maxSteps
|
|
1687
|
+
maxSteps: opts.maxSteps,
|
|
1688
|
+
maxRounds: opts.maxRounds
|
|
1229
1689
|
});
|
|
1230
1690
|
} catch (err) {
|
|
1231
1691
|
console.error(
|
|
1232
|
-
"Configuration error.
|
|
1692
|
+
"Configuration error. Run `windows-use init` to set up, or pass --api-key, --base-url, --model flags."
|
|
1233
1693
|
);
|
|
1234
1694
|
console.error(err instanceof Error ? err.message : err);
|
|
1235
1695
|
process.exit(1);
|
|
1236
1696
|
}
|
|
1697
|
+
const screenshotDir = join3(tmpdir(), "windows-use-screenshots");
|
|
1698
|
+
mkdirSync2(screenshotDir, { recursive: true });
|
|
1699
|
+
const { port, save: saveScreenshot } = await startScreenshotServer(screenshotDir);
|
|
1237
1700
|
const registry2 = new SessionRegistry();
|
|
1238
1701
|
const session = registry2.create(config);
|
|
1239
|
-
|
|
1240
|
-
|
|
1702
|
+
session.runner.setOnStep((event) => {
|
|
1703
|
+
const prefix = ` [step ${event.step}]`;
|
|
1704
|
+
switch (event.type) {
|
|
1705
|
+
case "thinking":
|
|
1706
|
+
console.log(`${prefix} \u{1F4AD} ${event.content}`);
|
|
1707
|
+
break;
|
|
1708
|
+
case "tool_call": {
|
|
1709
|
+
const argsStr = typeof event.args === "object" ? JSON.stringify(event.args, null, 0) : String(event.args);
|
|
1710
|
+
const preview = argsStr.length > 120 ? argsStr.slice(0, 120) + "..." : argsStr;
|
|
1711
|
+
console.log(`${prefix} \u{1F527} ${event.name}(${preview})`);
|
|
1712
|
+
break;
|
|
1713
|
+
}
|
|
1714
|
+
case "tool_result":
|
|
1715
|
+
console.log(`${prefix} \u2713 ${event.name} \u2192 ${event.result}`);
|
|
1716
|
+
break;
|
|
1717
|
+
case "error":
|
|
1718
|
+
console.log(`${prefix} \u2717 ${event.message}`);
|
|
1719
|
+
break;
|
|
1720
|
+
}
|
|
1721
|
+
});
|
|
1722
|
+
console.log(`
|
|
1723
|
+
[windows-use] Session ${session.id} created`);
|
|
1724
|
+
console.log(`[windows-use] Model: ${config.model}`);
|
|
1725
|
+
console.log(`[windows-use] Screenshots: http://127.0.0.1:${port}`);
|
|
1726
|
+
console.log(`[windows-use] Type "exit" or Ctrl+C to quit.
|
|
1727
|
+
`);
|
|
1728
|
+
const rl = createInterface({ input: process.stdin, output: process.stdout });
|
|
1729
|
+
const ask = (prompt) => new Promise((resolve) => rl.question(prompt, (a) => resolve(a.trim())));
|
|
1730
|
+
let nextInstruction = instruction ?? "";
|
|
1731
|
+
const printResult = (result) => {
|
|
1732
|
+
const statusIcon = result.status === "completed" ? "\u2705" : result.status === "blocked" ? "\u{1F6AB}" : "\u2753";
|
|
1733
|
+
console.log(`
|
|
1734
|
+
${statusIcon} [${result.status}]`);
|
|
1735
|
+
const blocks = parseReportContent(result.content, session.screenshots);
|
|
1736
|
+
for (const block of blocks) {
|
|
1737
|
+
if (block.type === "text") {
|
|
1738
|
+
process.stdout.write(block.text);
|
|
1739
|
+
} else {
|
|
1740
|
+
const url = saveScreenshot(block.base64);
|
|
1741
|
+
process.stdout.write(`
|
|
1742
|
+
\u{1F4F8} ${block.label}: ${url}
|
|
1743
|
+
`);
|
|
1744
|
+
}
|
|
1745
|
+
}
|
|
1746
|
+
if (result.data) {
|
|
1747
|
+
console.log(`
|
|
1748
|
+
Data: ${JSON.stringify(result.data)}`);
|
|
1749
|
+
}
|
|
1750
|
+
const roundInfo = `round ${session.runner.currentRound}/${config.maxRounds}`;
|
|
1751
|
+
console.log(`
|
|
1752
|
+
(${result.stepsUsed} steps, ${roundInfo})
|
|
1753
|
+
`);
|
|
1754
|
+
};
|
|
1241
1755
|
try {
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1756
|
+
while (true) {
|
|
1757
|
+
if (!nextInstruction) {
|
|
1758
|
+
nextInstruction = await ask("> ");
|
|
1759
|
+
} else {
|
|
1760
|
+
console.log(`> ${nextInstruction}`);
|
|
1761
|
+
}
|
|
1762
|
+
if (!nextInstruction || nextInstruction.toLowerCase() === "exit") {
|
|
1763
|
+
break;
|
|
1764
|
+
}
|
|
1765
|
+
if (session.runner.roundsExhausted) {
|
|
1766
|
+
console.log(`[windows-use] Session reached max rounds (${config.maxRounds}). Type "exit" to quit.
|
|
1767
|
+
`);
|
|
1768
|
+
nextInstruction = "";
|
|
1769
|
+
continue;
|
|
1770
|
+
}
|
|
1771
|
+
console.log("[windows-use] Running...\n");
|
|
1772
|
+
const result = await session.runner.run(nextInstruction);
|
|
1773
|
+
printResult(result);
|
|
1774
|
+
nextInstruction = "";
|
|
1775
|
+
}
|
|
1246
1776
|
} catch (err) {
|
|
1247
|
-
console.error("
|
|
1777
|
+
console.error("\nFatal error:", err instanceof Error ? err.message : err);
|
|
1778
|
+
} finally {
|
|
1779
|
+
rl.close();
|
|
1248
1780
|
await registry2.destroyAll();
|
|
1249
|
-
|
|
1781
|
+
console.log("[windows-use] Session ended.");
|
|
1782
|
+
process.exit(0);
|
|
1250
1783
|
}
|
|
1251
1784
|
});
|
|
1252
1785
|
program.parse();
|