windows-use 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,1253 @@
1
+ #!/usr/bin/env node
2
var __getOwnPropNames = Object.getOwnPropertyNames;
/**
 * Bundler runtime helper: wraps a single-entry `{ "module/path"() { ... } }`
 * object into a lazy, run-once initializer.
 *
 * The returned `__init` function runs the module body on its first call and
 * caches whatever it returns; later calls return the cached value without
 * re-running the body.
 *
 * @param {object|0} fn  Object whose first own property is the module body.
 * @param {*} [res]      Initial cached result (normally omitted).
 * @returns {Function}   The lazy initializer.
 */
var __esm = (fn, res) => function __init() {
  if (fn) {
    const moduleBody = fn[__getOwnPropNames(fn)[0]];
    // Clear `fn` before running the body so a re-entrant call skips it.
    fn = 0;
    res = moduleBody(0);
  }
  return res;
};
6
+
7
+ // src/config/schema.ts
8
+ import { z } from "zod";
9
var ConfigSchema;
var init_schema = __esm({
  "src/config/schema.ts"() {
    "use strict";
    // Small factories for the validator chains that appear more than once.
    const requiredText = (message) => z.string().min(1, message);
    const positiveInt = (fallback) => z.number().int().positive().default(fallback);
    // Schema for the resolved runtime configuration: credentials and model
    // are mandatory, the remaining fields fall back to built-in defaults.
    ConfigSchema = z.object({
      apiKey: requiredText("API key is required"),
      baseURL: z.string().url("Must be a valid URL"),
      model: requiredText("Model name is required"),
      maxSteps: positiveInt(50),
      contextWindowSize: positiveInt(20),
      cdpUrl: z.string().default("http://localhost:9222"),
      timeoutMs: z.number().default(3e5)
    });
  }
});
24
+
25
+ // src/config/loader.ts
26
/**
 * Build the runtime configuration.
 *
 * Each field is resolved in priority order: explicit override, then the
 * matching `WINDOWS_USE_*` environment variable, then a built-in fallback.
 * The merged object is validated with `ConfigSchema.parse`, which throws when
 * a field is invalid.
 *
 * @param {object} [overrides] Optional per-field overrides.
 * @returns {object} The parsed configuration.
 */
function loadConfig(overrides) {
  const env = process.env;
  // First non-nullish value wins: override -> environment -> fallback.
  const resolve = (field, envValue, fallback) => overrides?.[field] ?? envValue ?? fallback;
  const candidate = {
    apiKey: resolve("apiKey", env.WINDOWS_USE_API_KEY, ""),
    baseURL: resolve("baseURL", env.WINDOWS_USE_BASE_URL, ""),
    model: resolve("model", env.WINDOWS_USE_MODEL, ""),
    maxSteps: resolve("maxSteps", intEnv("WINDOWS_USE_MAX_STEPS"), 50),
    contextWindowSize: resolve("contextWindowSize", intEnv("WINDOWS_USE_CONTEXT_WINDOW"), 20),
    cdpUrl: resolve("cdpUrl", env.WINDOWS_USE_CDP_URL, "http://localhost:9222"),
    timeoutMs: resolve("timeoutMs", intEnv("WINDOWS_USE_TIMEOUT_MS"), 3e5)
  };
  return ConfigSchema.parse(candidate);
}
38
/**
 * Read an environment variable as an integer.
 *
 * The whole (trimmed) value must be an integer. Values with trailing junk
 * such as "30s", or non-integers such as "12.5", are treated as unset rather
 * than being partially parsed, and exponent notation such as "3e5" is read as
 * 300000. (A plain `parseInt` would turn "3e5" into 3 and "30s" into 30.)
 *
 * @param {string} name Environment variable name.
 * @returns {number|undefined} The integer value, or `undefined` when the
 *   variable is unset, blank, or not a safe integer.
 */
function intEnv(name) {
  const raw = process.env[name];
  if (raw === void 0) return void 0;
  const text = raw.trim();
  // Number("") is 0, so a blank value has to be rejected explicitly.
  if (text === "") return void 0;
  const parsed = Number(text);
  return Number.isSafeInteger(parsed) ? parsed : void 0;
}
44
// Lazy initializer for the config loader module: its only setup step is
// making sure the schema module has been initialized.
var init_loader = __esm({
  "src/config/loader.ts": function () {
    "use strict";
    init_schema();
  }
});
50
+
51
+ // src/agent/context-manager.ts
52
var ContextManager;
var init_context_manager = __esm({
  "src/agent/context-manager.ts"() {
    "use strict";
    /**
     * Stores the full conversation and hands out a bounded "window" of it.
     *
     * `maxMessages` is the target size of the window, counting the leading
     * system prompt when there is one.
     */
    ContextManager = class {
      messages = [];
      maxMessages;
      /** @param {number} maxMessages Target number of messages per window. */
      constructor(maxMessages) {
        this.maxMessages = maxMessages;
      }
      /** Append a message to the stored history. */
      append(message) {
        this.messages.push(message);
      }
      /**
       * Returns the system prompt + the most recent messages within the window.
       *
       * The result is always a fresh array, so callers may push onto it
       * without touching the stored history. When the cut-off would fall on a
       * `tool` message, the window is extended backwards to the first
       * preceding non-tool message so that tool results are never sent
       * without the message that precedes them; in that case the window can
       * exceed `maxMessages` by the size of that group.
       *
       * @returns {object[]} Messages to send to the model.
       */
      getWindow() {
        if (this.messages.length === 0) return [];
        const systemPrompt = this.messages[0]?.role === "system" ? this.messages[0] : null;
        const nonSystem = systemPrompt ? this.messages.slice(1) : this.messages;
        const windowSize = Math.max(this.maxMessages - (systemPrompt ? 1 : 0), 0);
        // Use an explicit start index: slice(-0) would return everything
        // when the window size is 0.
        let start = Math.max(nonSystem.length - windowSize, 0);
        // Never begin the window on an orphaned tool result.
        while (start > 0 && start < nonSystem.length && nonSystem[start].role === "tool") {
          start--;
        }
        const windowed = nonSystem.slice(start);
        return systemPrompt ? [systemPrompt, ...windowed] : windowed;
      }
      /** Total messages stored (before windowing). */
      get length() {
        return this.messages.length;
      }
    };
  }
});
81
+
82
+ // src/agent/llm-client.ts
83
+ import OpenAI from "openai";
84
var LLMClient;
var init_llm_client = __esm({
  "src/agent/llm-client.ts"() {
    "use strict";
    /** Thin wrapper around the OpenAI SDK bound to one model. */
    LLMClient = class {
      client;
      model;
      /**
       * @param {object} config Must provide `apiKey`, `baseURL` and `model`.
       */
      constructor(config) {
        const credentials = { apiKey: config.apiKey, baseURL: config.baseURL };
        this.client = new OpenAI(credentials);
        this.model = config.model;
      }
      /**
       * Request one chat completion.
       *
       * Tool definitions and `tool_choice: "auto"` are only sent when at
       * least one tool is supplied.
       *
       * @param {object[]} messages Conversation to send.
       * @param {object[]} tools    Tool definitions (may be empty).
       * @returns {Promise<object>} The SDK's completion response.
       */
      async chat(messages, tools) {
        const request = {
          model: this.model,
          messages,
          tools: void 0,
          tool_choice: void 0
        };
        if (tools.length > 0) {
          request.tools = tools;
          request.tool_choice = "auto";
        }
        return this.client.chat.completions.create(request);
      }
    };
  }
});
109
+
110
+ // src/agent/system-prompt.ts
111
/**
 * Build the system prompt sent as the first message of every session.
 *
 * The prompt is assembled line by line (empty strings are blank lines) and
 * joined with "\n".
 *
 * @returns {string} The full prompt text.
 */
function buildSystemPrompt() {
  const lines = [
    "You are a precise Windows and browser automation agent. Your job is to execute instructions by calling the tools available to you.",
    "",
    "## Workflow",
    "1. Take a screenshot first to understand the current state of the screen.",
    "2. Plan the minimal sequence of actions needed.",
    "3. Execute each action one at a time, then verify by taking another screenshot.",
    "4. When the task is done, you are blocked, or you need guidance, call `report` immediately.",
    "",
    "## Rules",
    "- ALWAYS take a screenshot before your first action to understand the current state.",
    "- After every mouse click or keyboard action, take a screenshot to verify the result.",
    "- Call ONE tool at a time. Never request multiple tools in parallel.",
    "- Before each tool call, briefly state what you are about to do and why.",
    "- After receiving a tool result, describe what you observed.",
    "- For browser tasks, prefer using browser_* tools over clicking on-screen coordinates.",
    "- For terminal tasks, prefer `run_command` over GUI interactions when possible.",
    "- Do not read or write files unless the instruction explicitly asks for it.",
    "",
    "## report Tool",
    "Call `report` when:",
    '- **"completed"**: The task is done successfully. Summarize what was accomplished.',
    `- **"blocked"**: You cannot proceed (CAPTCHA, login wall, unexpected error). Explain what's blocking you.`,
    '- **"need_guidance"**: You need a decision or clarification. Describe what you need.',
    "",
    "Calling `report` stops your execution. Include a concise summary and optionally a screenshot as evidence.",
    "",
    "## Important",
    '- Do NOT keep retrying the same failing action. If something fails twice, call `report` with status "blocked".',
    "- If a UI element is not where you expect it, try scrolling first before giving up.",
    "- Keep your responses concise. Focus on actions, not explanations."
  ];
  return lines.join("\n");
}
143
// Lazy initializer for the system-prompt module; the module has no setup work.
var init_system_prompt = __esm({
  "src/agent/system-prompt.ts": function () {
    "use strict";
  }
});
148
+
149
+ // src/agent/runner.ts
150
var AgentRunner;
var init_runner = __esm({
  "src/agent/runner.ts"() {
    "use strict";
    init_system_prompt();
    /**
     * Drives the LLM tool-calling loop for one session.
     *
     * Each `run()` appends the instruction to the shared context, then
     * alternates between asking the model for the next step and executing the
     * tool calls it requests, until the model calls `report`, stops without a
     * tool call, an LLM error occurs, or `config.maxSteps` is exhausted.
     */
    AgentRunner = class {
      llmClient;
      contextManager;
      toolRegistry;
      config;
      toolContext;
      initialized = false;
      /**
       * @param {object} llmClient      Provides `chat(messages, tools)`.
       * @param {object} contextManager Provides `append(message)` and `getWindow()`.
       * @param {object} toolRegistry   Provides `toOpenAIFormat()` and `execute(name, args, ctx)`.
       * @param {object} config         Must provide `maxSteps`.
       * @param {object} toolContext    Passed through to every tool execution.
       */
      constructor(llmClient, contextManager, toolRegistry, config, toolContext) {
        this.llmClient = llmClient;
        this.contextManager = contextManager;
        this.toolRegistry = toolRegistry;
        this.config = config;
        this.toolContext = toolContext;
      }
      /**
       * Execute one instruction.
       *
       * @param {string} instruction User instruction to carry out.
       * @returns {Promise<object>} `{ status, summary, stepsUsed }` plus
       *   `screenshot` and `data` when the model submitted a report.
       */
      async run(instruction) {
        // The system prompt is added once; later runs reuse the same history.
        if (!this.initialized) {
          this.contextManager.append({
            role: "system",
            content: buildSystemPrompt()
          });
          this.initialized = true;
        }
        this.contextManager.append({
          role: "user",
          content: instruction
        });
        // Records the reply to one tool call in the conversation history.
        const respond = (toolCall, content) => {
          this.contextManager.append({
            role: "tool",
            tool_call_id: toolCall.id,
            content
          });
        };
        let stepsUsed = 0;
        while (stepsUsed < this.config.maxSteps) {
          stepsUsed++;
          const remaining = this.config.maxSteps - stepsUsed;
          // Copy the window so the transient reminder below can never end up
          // in the stored history.
          const messages = [...this.contextManager.getWindow()];
          if (remaining <= 3 && remaining >= 0) {
            messages.push({
              role: "system",
              content: `\u26A0\uFE0F You have ${remaining} steps remaining. Call \`report\` NOW to summarize your progress. If you do not call report, your work will be lost.`
            });
          }
          const tools = this.toolRegistry.toOpenAIFormat();
          let response;
          try {
            response = await this.llmClient.chat(messages, tools);
          } catch (err) {
            const msg = err instanceof Error ? err.message : String(err);
            return {
              status: "blocked",
              summary: `LLM API error: ${msg}`,
              stepsUsed
            };
          }
          const choice = response.choices[0];
          if (!choice) {
            return {
              status: "blocked",
              summary: "LLM returned empty response",
              stepsUsed
            };
          }
          const message = choice.message;
          // A plain answer (no tool call) hands control back to the caller.
          if (choice.finish_reason === "stop" || !message.tool_calls?.length) {
            const text = message.content ?? "";
            this.contextManager.append({ role: "assistant", content: text });
            return {
              status: "need_guidance",
              summary: text || "Agent stopped without calling report.",
              stepsUsed
            };
          }
          this.contextManager.append(message);
          const toolCalls = message.tool_calls;
          for (let index = 0; index < toolCalls.length; index++) {
            const toolCall = toolCalls[index];
            let args;
            try {
              args = JSON.parse(toolCall.function.arguments);
            } catch {
              respond(toolCall, "Error: could not parse tool arguments as JSON");
              continue;
            }
            let result;
            try {
              result = await this.toolRegistry.execute(
                toolCall.function.name,
                args,
                this.toolContext
              );
            } catch (err) {
              const msg = err instanceof Error ? err.message : String(err);
              respond(toolCall, `Error executing tool: ${msg}`);
              continue;
            }
            if (result.type === "report") {
              respond(toolCall, "Report submitted. Returning control to caller.");
              // Answer every tool call that will no longer run, so the stored
              // history has a reply for each requested call if this session
              // is used for another run().
              for (const skipped of toolCalls.slice(index + 1)) {
                respond(skipped, "Skipped: execution stopped because `report` was called.");
              }
              return {
                status: result.status,
                summary: result.summary,
                screenshot: result.screenshot,
                data: result.data,
                stepsUsed
              };
            }
            if (result.type === "image") {
              // NOTE(review): the image is sent as a multi-part tool message;
              // confirm the configured endpoint accepts image parts there.
              respond(toolCall, [
                { type: "text", text: "Screenshot captured." },
                {
                  type: "image_url",
                  image_url: {
                    url: `data:${result.mimeType};base64,${result.base64}`
                  }
                }
              ]);
            } else {
              respond(toolCall, result.content);
            }
          }
        }
        return {
          status: "blocked",
          summary: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
          stepsUsed
        };
      }
    };
  }
});
297
+
298
+ // src/tools/browser/client.ts
299
var BrowserClient;
var init_client = __esm({
  "src/tools/browser/client.ts"() {
    "use strict";
    /**
     * Lazily connected Playwright client attached to a browser over CDP.
     * Tracks one "current" page that browser tools operate on.
     */
    BrowserClient = class {
      browser = null;
      context = null;
      _page = null;
      cdpUrl;
      /** @param {string} cdpUrl CDP endpoint passed to `connectOverCDP`. */
      constructor(cdpUrl) {
        this.cdpUrl = cdpUrl;
      }
      /**
       * Connect if there is no live connection.
       *
       * A handle whose `isConnected()` is false is discarded and replaced by
       * a fresh connection. State is only committed once browser, context and
       * page are all available, so a failed attempt leaves the client
       * disconnected and retryable.
       *
       * @returns {Promise<void>}
       */
      async connect() {
        if (this.browser) {
          if (this.browser.isConnected()) return;
          // Stale handle: drop it and reconnect below.
          this.browser = null;
          this.context = null;
          this._page = null;
        }
        const { chromium } = await import("playwright");
        const browser = await chromium.connectOverCDP(this.cdpUrl);
        try {
          // Reuse the first existing context/page, creating them if missing.
          const context = browser.contexts()[0] ?? await browser.newContext();
          const page = context.pages()[0] ?? await context.newPage();
          this.browser = browser;
          this.context = context;
          this._page = page;
        } catch (err) {
          // Best-effort cleanup; the original error is what matters.
          await browser.close().catch(() => {
          });
          throw err;
        }
      }
      /** @returns {Promise<object>} The current page, connecting first if needed. */
      async getPage() {
        await this.connect();
        return this._page;
      }
      /** Create a new tab and switch to it. */
      async newPage() {
        await this.connect();
        this._page = await this.context.newPage();
        return this._page;
      }
      /** Close the connection (errors ignored) and reset all state. */
      async close() {
        if (this.browser) {
          await this.browser.close().catch(() => {
          });
          this.browser = null;
          this.context = null;
          this._page = null;
        }
      }
      /** True when a browser handle exists and reports itself connected. */
      get connected() {
        return this.browser !== null && this.browser.isConnected();
      }
    };
  }
});
345
+
346
+ // src/tools/zod-to-json.ts
347
/**
 * Convert a Zod schema into a JSON-Schema-like object.
 * Public entry point; the recursive work is done by `convertZodType`.
 *
 * @param {object} schema Zod schema instance.
 * @returns {object} The converted schema.
 */
function zodToJsonSchema(schema) {
  const jsonSchema = convertZodType(schema);
  return jsonSchema;
}
350
/**
 * Recursively convert a Zod type into a JSON-Schema-like object by inspecting
 * `schema._def.typeName`.
 *
 * Handles objects, strings, numbers, booleans, enums and arrays. Optional,
 * default and effects wrappers are unwrapped to their inner type; unknown
 * becomes `{}`; any other type falls back to `{ type: "string" }`.
 *
 * A description is copied from whichever node carries it, including the
 * wrappers, so `z.enum([...]).default("left").describe("...")` keeps its
 * description. A description on a wrapper overrides one found on the type it
 * wraps.
 *
 * @param {object} schema Zod schema instance.
 * @returns {object} JSON-Schema-like description of the type.
 */
function convertZodType(schema) {
  const def = schema._def;
  const typeName = def?.typeName;
  // Attach this node's own description (if any) to a converted result.
  const described = (result) => {
    if (def?.description) result.description = def.description;
    return result;
  };
  switch (typeName) {
    case "ZodObject": {
      const shape = schema.shape;
      const properties = {};
      const required = [];
      for (const [key, value] of Object.entries(shape)) {
        properties[key] = convertZodType(value);
        if (!isOptional(value)) {
          required.push(key);
        }
      }
      return described({
        type: "object",
        properties,
        ...required.length > 0 ? { required } : {}
      });
    }
    case "ZodString":
      return described({ type: "string" });
    case "ZodNumber":
      return described({ type: "number" });
    case "ZodBoolean":
      return described({ type: "boolean" });
    case "ZodEnum":
      return described({ type: "string", enum: def.values });
    case "ZodArray":
      return described({ type: "array", items: convertZodType(def.type) });
    case "ZodOptional":
    case "ZodDefault":
      return described(convertZodType(def.innerType));
    case "ZodEffects":
      return described(convertZodType(def.schema));
    case "ZodUnknown":
      return described({});
    default:
      // Unsupported types are advertised as plain strings.
      return described({ type: "string" });
  }
}
413
/**
 * Tell whether a Zod type may be omitted from its parent object, i.e. it is
 * wrapped in `ZodOptional` or `ZodDefault`.
 *
 * @param {object} schema Zod schema instance.
 * @returns {boolean}
 */
function isOptional(schema) {
  const omittableKinds = ["ZodOptional", "ZodDefault"];
  return omittableKinds.includes(schema._def?.typeName);
}
418
// Lazy initializer for the zod-to-json module; the module has no setup work.
var init_zod_to_json = __esm({
  "src/tools/zod-to-json.ts": function () {
    "use strict";
  }
});
423
+
424
+ // src/tools/registry.ts
425
var ToolRegistry;
var init_registry = __esm({
  "src/tools/registry.ts"() {
    "use strict";
    init_zod_to_json();
    /** Name-indexed collection of tools with validated execution. */
    ToolRegistry = class {
      tools = /* @__PURE__ */ new Map();
      /** Add a tool; a tool registered later under the same name replaces it. */
      register(tool) {
        this.tools.set(tool.name, tool);
      }
      /** @returns {object|undefined} The tool registered under `name`. */
      get(name) {
        return this.tools.get(name);
      }
      /**
       * Describe every registered tool as an OpenAI "function" tool entry,
       * converting each parameter schema with `zodToJsonSchema`.
       *
       * @returns {object[]}
       */
      toOpenAIFormat() {
        const definitions = [];
        for (const { name, description, parameters } of this.tools.values()) {
          definitions.push({
            type: "function",
            function: {
              name,
              description,
              parameters: zodToJsonSchema(parameters)
            }
          });
        }
        return definitions;
      }
      /**
       * Validate `args` against the tool's schema and run the tool.
       *
       * Unknown tools and invalid arguments do not throw; they are reported
       * as a text result beginning with "Error:".
       *
       * @param {string} name    Tool name.
       * @param {*}      args    Raw arguments.
       * @param {object} context Passed to the tool as second argument.
       * @returns {Promise<object>} The tool result or an error text result.
       */
      async execute(name, args, context) {
        const tool = this.tools.get(name);
        if (!tool) {
          return { type: "text", content: `Error: unknown tool "${name}"` };
        }
        const outcome = tool.parameters.safeParse(args);
        if (outcome.success) {
          return tool.execute(outcome.data, context);
        }
        return {
          type: "text",
          content: `Error: invalid arguments for "${name}": ${outcome.error.message}`
        };
      }
    };
  }
});
465
+
466
+ // src/tools/windows/screenshot.ts
467
+ import { z as z2 } from "zod";
468
var screenshotTool;
var init_screenshot = __esm({
  "src/tools/windows/screenshot.ts"() {
    "use strict";
    // Tool: capture the primary monitor (or the first one listed when none
    // reports itself as primary) and return it as a base64 PNG image result.
    screenshotTool = {
      name: "screenshot",
      description: "Capture the full screen and return it as an image. Use this to see what is currently displayed.",
      parameters: z2.object({}),
      async execute() {
        const screenshots = await import("node-screenshots");
        const displays = screenshots.Monitor.all();
        let target = displays.find((display) => display.isPrimary());
        target ??= displays[0];
        if (!target) {
          return { type: "text", content: "Error: No monitor found" };
        }
        const png = target.captureImageSync().toPngSync();
        return {
          type: "image",
          base64: png.toString("base64"),
          mimeType: "image/png"
        };
      }
    };
  }
});
494
+
495
+ // src/tools/windows/mouse.ts
496
+ import { z as z3 } from "zod";
497
/**
 * Load the nut-js automation library on demand via a dynamic import.
 *
 * @returns {Promise<object>} The `@nut-tree-fork/nut-js` module namespace.
 */
async function getNutJs() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
500
var mouseClickTool, mouseMoveTool, mouseScrollTool;
var init_mouse = __esm({
  "src/tools/windows/mouse.ts"() {
    "use strict";
    // Move the pointer in a straight line to the given screen coordinates.
    const glideTo = async (nut, x, y) => {
      const destination = new nut.Point(x, y);
      await nut.mouse.move(nut.straightTo(destination));
    };
    // Tool: move to (x, y) and click the requested button there.
    mouseClickTool = {
      name: "mouse_click",
      description: "Click the mouse at the given screen coordinates.",
      parameters: z3.object({
        x: z3.number().describe("X coordinate on screen"),
        y: z3.number().describe("Y coordinate on screen"),
        button: z3.enum(["left", "right", "middle"]).default("left").describe("Mouse button")
      }),
      async execute(args) {
        const nut = await getNutJs();
        await glideTo(nut, args.x, args.y);
        const { LEFT, RIGHT, MIDDLE } = nut.Button;
        const byName = { left: LEFT, right: RIGHT, middle: MIDDLE };
        await nut.mouse.click(byName[args.button]);
        return { type: "text", content: `Clicked ${args.button} at (${args.x}, ${args.y})` };
      }
    };
    // Tool: move the pointer without clicking.
    mouseMoveTool = {
      name: "mouse_move",
      description: "Move the mouse to the given screen coordinates without clicking.",
      parameters: z3.object({
        x: z3.number().describe("X coordinate on screen"),
        y: z3.number().describe("Y coordinate on screen")
      }),
      async execute(args) {
        const nut = await getNutJs();
        await glideTo(nut, args.x, args.y);
        return { type: "text", content: `Mouse moved to (${args.x}, ${args.y})` };
      }
    };
    // Tool: scroll the wheel one notch at a time, `amount` times.
    mouseScrollTool = {
      name: "mouse_scroll",
      description: "Scroll the mouse wheel.",
      parameters: z3.object({
        direction: z3.enum(["up", "down"]).describe("Scroll direction"),
        amount: z3.number().positive().default(3).describe("Number of scroll steps")
      }),
      async execute(args) {
        const nut = await getNutJs();
        const scrollOnce = args.direction === "down" ? () => nut.mouse.scrollDown(1) : () => nut.mouse.scrollUp(1);
        let stepsLeft = args.amount;
        while (stepsLeft > 0) {
          await scrollOnce();
          stepsLeft--;
        }
        return { type: "text", content: `Scrolled ${args.direction} ${args.amount} steps` };
      }
    };
  }
});
560
+
561
+ // src/tools/windows/keyboard.ts
562
+ import { z as z4 } from "zod";
563
/**
 * Load the nut-js automation library on demand via a dynamic import
 * (keyboard module's copy of the loader).
 *
 * @returns {Promise<object>} The `@nut-tree-fork/nut-js` module namespace.
 */
async function getNutJs2() {
  const nutModule = await import("@nut-tree-fork/nut-js");
  return nutModule;
}
566
/**
 * Translate a user-facing key name into a member of `nut.Key`.
 *
 * Resolution order:
 *   1. friendly aliases ("ctrl", "enter", "esc", ...), case-insensitive;
 *   2. the trimmed name used verbatim as a `nut.Key` member name;
 *   3. for single characters, the upper-cased character ("a" -> "A").
 *
 * Surrounding whitespace is ignored in every step, and only own properties
 * of the alias table and of `nut.Key` are consulted, so names such as
 * "constructor" are rejected as unknown keys.
 *
 * @param {object} nut     nut-js module namespace (must expose `Key`).
 * @param {string} keyName Key name as supplied by the caller.
 * @returns {*} The matching `nut.Key` value.
 * @throws {Error} `Unknown key: "<keyName>"` when nothing matches.
 */
function resolveKey(nut, keyName) {
  const keyMap = {
    "ctrl": "LeftControl",
    "control": "LeftControl",
    "shift": "LeftShift",
    "alt": "LeftAlt",
    "meta": "LeftWin",
    "win": "LeftWin",
    "windows": "LeftWin",
    "enter": "Return",
    "return": "Return",
    "tab": "Tab",
    "escape": "Escape",
    "esc": "Escape",
    "backspace": "Backspace",
    "delete": "Delete",
    "space": "Space",
    "up": "Up",
    "down": "Down",
    "left": "Left",
    "right": "Right",
    "home": "Home",
    "end": "End",
    "pageup": "PageUp",
    "pagedown": "PageDown",
    "f1": "F1",
    "f2": "F2",
    "f3": "F3",
    "f4": "F4",
    "f5": "F5",
    "f6": "F6",
    "f7": "F7",
    "f8": "F8",
    "f9": "F9",
    "f10": "F10",
    "f11": "F11",
    "f12": "F12"
  };
  const hasOwn = (object, property) => Object.prototype.hasOwnProperty.call(object, property);
  // Look a member up on nut.Key without walking the prototype chain.
  const keyFor = (memberName) => hasOwn(nut.Key, memberName) ? nut.Key[memberName] : void 0;
  const trimmed = keyName.trim();
  const normalized = trimmed.toLowerCase();
  const mapped = hasOwn(keyMap, normalized) ? keyMap[normalized] : trimmed;
  const key = keyFor(mapped);
  if (key !== void 0) return key;
  if (mapped.length === 1) {
    // Single letters are accepted in either case.
    const k = keyFor(mapped.toUpperCase());
    if (k !== void 0) return k;
  }
  throw new Error(`Unknown key: "${keyName}"`);
}
615
var keyboardTypeTool, keyboardPressTool;
var init_keyboard = __esm({
  "src/tools/windows/keyboard.ts"() {
    "use strict";
    // Tool: type a string through the nut-js keyboard.
    keyboardTypeTool = {
      name: "keyboard_type",
      description: "Type text using the keyboard. The text is typed character by character.",
      parameters: z4.object({
        text: z4.string().describe("The text to type")
      }),
      async execute(args) {
        const nut = await getNutJs2();
        await nut.keyboard.type(args.text);
        return { type: "text", content: `Typed: "${args.text}"` };
      }
    };
    // Tool: press a key combination, then release it.
    keyboardPressTool = {
      name: "keyboard_press",
      description: 'Press a key combination. Examples: ["Ctrl", "C"] for copy, ["Enter"] for enter, ["Alt", "F4"] to close window.',
      parameters: z4.object({
        keys: z4.array(z4.string()).min(1).describe("Array of key names to press simultaneously")
      }),
      async execute(args) {
        const nut = await getNutJs2();
        // Resolve every name first so an unknown key fails before anything
        // is pressed.
        const resolved = args.keys.map((k) => resolveKey(nut, k));
        try {
          await nut.keyboard.pressKey(...resolved);
        } catch (err) {
          // Part of the combination may already be down: release on a
          // best-effort basis so no key is left held, then surface the
          // original failure.
          await nut.keyboard.releaseKey(...resolved).catch(() => {
          });
          throw err;
        }
        await nut.keyboard.releaseKey(...resolved);
        return { type: "text", content: `Pressed: ${args.keys.join("+")}` };
      }
    };
  }
});
647
+
648
+ // src/tools/windows/command.ts
649
+ import { z as z5 } from "zod";
650
+ import { exec } from "child_process";
651
var MAX_OUTPUT_LENGTH, runCommandTool;
var init_command = __esm({
  "src/tools/windows/command.ts"() {
    "use strict";
    // Maximum number of characters of combined output returned to the model.
    MAX_OUTPUT_LENGTH = 1e4;
    // Tool: run a command through powershell.exe and return stdout, stderr
    // and a status marker as a single text result. The promise always
    // resolves; failures are described in the text.
    runCommandTool = {
      name: "run_command",
      description: "Execute a shell command and return its output. Uses PowerShell on Windows.",
      parameters: z5.object({
        command: z5.string().describe("The command to execute"),
        timeout: z5.number().positive().default(3e4).describe("Timeout in milliseconds")
      }),
      async execute(args) {
        return new Promise((resolve) => {
          exec(
            args.command,
            {
              timeout: args.timeout,
              maxBuffer: 1024 * 1024,
              shell: "powershell.exe",
              windowsHide: true
            },
            (error, stdout, stderr) => {
              let output = "";
              if (stdout) output += stdout;
              if (stderr) output += `\n[stderr] ${stderr}`;
              if (error) {
                if (error.code === "ERR_CHILD_PROCESS_STDIO_MAXBUFFER") {
                  // Exceeding maxBuffer also kills the child, so this has to
                  // be checked before `killed` or it is reported as a timeout.
                  output += "\n[output limit] Command produced too much output and was terminated";
                } else if (error.killed) {
                  output += "\n[timeout] Command timed out";
                } else {
                  output += `\n[exit code ${error.code}]`;
                }
              }
              if (output.length > MAX_OUTPUT_LENGTH) {
                output = output.slice(0, MAX_OUTPUT_LENGTH) + "\n...(truncated)";
              }
              resolve({ type: "text", content: output.trim() || "(no output)" });
            }
          );
        });
      }
    };
  }
});
692
+
693
+ // src/tools/file/read.ts
694
+ import { z as z6 } from "zod";
695
+ import { readFile } from "fs/promises";
696
var MAX_FILE_SIZE, fileReadTool;
var init_read = __esm({
  "src/tools/file/read.ts"() {
    "use strict";
    // Maximum number of characters of file content returned to the model.
    MAX_FILE_SIZE = 1e5;
    // Tool: read a file as UTF-8 text. Longer content is cut at
    // MAX_FILE_SIZE characters and marked as truncated; read failures are
    // returned as a text result instead of being thrown.
    fileReadTool = {
      name: "file_read",
      description: "Read the contents of a file at the given path.",
      parameters: z6.object({
        path: z6.string().describe("Absolute path to the file")
      }),
      async execute(args) {
        let text;
        try {
          text = await readFile(args.path, "utf-8");
        } catch (err) {
          const reason = err instanceof Error ? err.message : String(err);
          return { type: "text", content: `Error reading file: ${reason}` };
        }
        if (text.length <= MAX_FILE_SIZE) {
          return { type: "text", content: text };
        }
        return {
          type: "text",
          content: text.slice(0, MAX_FILE_SIZE) + "\n...(truncated)"
        };
      }
    };
  }
});
725
+
726
+ // src/tools/file/write.ts
727
+ import { z as z7 } from "zod";
728
+ import { writeFile, mkdir } from "fs/promises";
729
+ import { dirname } from "path";
730
var fileWriteTool;
var init_write = __esm({
  "src/tools/file/write.ts"() {
    "use strict";
    // Tool: write UTF-8 text to a file, creating missing parent directories
    // first. Failures are returned as a text result instead of being thrown.
    fileWriteTool = {
      name: "file_write",
      description: "Write content to a file at the given path. Creates parent directories if needed.",
      parameters: z7.object({
        path: z7.string().describe("Absolute path to the file"),
        content: z7.string().describe("Content to write")
      }),
      async execute(args) {
        const target = args.path;
        try {
          const parentDir = dirname(target);
          await mkdir(parentDir, { recursive: true });
          await writeFile(target, args.content, "utf-8");
        } catch (err) {
          const reason = err instanceof Error ? err.message : String(err);
          return { type: "text", content: `Error writing file: ${reason}` };
        }
        return { type: "text", content: `File written: ${target}` };
      }
    };
  }
});
754
+
755
+ // src/tools/browser/navigate.ts
756
+ import { z as z8 } from "zod";
757
var browserNavigateTool;
var init_navigate = __esm({
  "src/tools/browser/navigate.ts"() {
    "use strict";
    // Tool: load a URL in the current page, waiting for DOMContentLoaded
    // (30 s limit), and report the resulting page title.
    browserNavigateTool = {
      name: "browser_navigate",
      description: "Navigate the browser to a URL.",
      parameters: z8.object({
        url: z8.string().describe("The URL to navigate to")
      }),
      async execute(args, ctx) {
        const client = await ctx.getBrowser();
        const page = await client.getPage();
        const navigation = { waitUntil: "domcontentloaded", timeout: 3e4 };
        await page.goto(args.url, navigation);
        const title = await page.title();
        const summary = `Navigated to: ${args.url}\nPage title: ${title}`;
        return { type: "text", content: summary };
      }
    };
  }
});
778
+
779
+ // src/tools/browser/click.ts
780
+ import { z as z9 } from "zod";
781
var browserClickTool;
var init_click = __esm({
  "src/tools/browser/click.ts"() {
    "use strict";
    // Tool: click the element matched by a selector, giving up after 10 s.
    browserClickTool = {
      name: "browser_click",
      description: "Click an element on the web page using a CSS selector or text content.",
      parameters: z9.object({
        selector: z9.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
      }),
      async execute(args, ctx) {
        const client = await ctx.getBrowser();
        const page = await client.getPage();
        const target = args.selector;
        await page.click(target, { timeout: 1e4 });
        return { type: "text", content: `Clicked element: ${target}` };
      }
    };
  }
});
800
+
801
+ // src/tools/browser/type.ts
802
+ import { z as z10 } from "zod";
803
var browserTypeTool;
var init_type = __esm({
  "src/tools/browser/type.ts"() {
    "use strict";
    // Tool: enter text into an input. With `clear` the page's `fill` method
    // is used, otherwise its `type` method; both get a 10 s timeout.
    browserTypeTool = {
      name: "browser_type",
      description: "Type text into an input field on the web page.",
      parameters: z10.object({
        selector: z10.string().describe("CSS selector for the input element"),
        text: z10.string().describe("Text to type"),
        clear: z10.boolean().default(true).describe("Whether to clear the field before typing")
      }),
      async execute(args, ctx) {
        const client = await ctx.getBrowser();
        const page = await client.getPage();
        const inputMethod = args.clear ? "fill" : "type";
        await page[inputMethod](args.selector, args.text, { timeout: 1e4 });
        return { type: "text", content: `Typed "${args.text}" into ${args.selector}` };
      }
    };
  }
});
828
+
829
+ // src/tools/browser/screenshot.ts
830
+ import { z as z11 } from "zod";
831
var browserScreenshotTool;
var init_screenshot2 = __esm({
  "src/tools/browser/screenshot.ts"() {
    "use strict";
    // Tool: take a PNG screenshot of the current page (optionally the full
    // scrollable page) and return it as a base64 image result.
    browserScreenshotTool = {
      name: "browser_screenshot",
      description: "Take a screenshot of the current browser page.",
      parameters: z11.object({
        fullPage: z11.boolean().default(false).describe("Whether to capture the full scrollable page")
      }),
      async execute(args, ctx) {
        const client = await ctx.getBrowser();
        const page = await client.getPage();
        const captureOptions = { type: "png", fullPage: args.fullPage };
        const png = await page.screenshot(captureOptions);
        const base64 = png.toString("base64");
        return { type: "image", base64, mimeType: "image/png" };
      }
    };
  }
});
857
+
858
+ // src/tools/browser/content.ts
859
+ import { z as z12 } from "zod";
860
var MAX_CONTENT_LENGTH, browserContentTool;
var init_content = __esm({
  "src/tools/browser/content.ts"() {
    "use strict";
    // Maximum number of characters of page text returned to the model.
    MAX_CONTENT_LENGTH = 2e4;
    // Tool: return the URL, title and text of <body>. A failure while
    // reading the body text yields an empty text section instead of an
    // error; longer text is cut at MAX_CONTENT_LENGTH characters.
    browserContentTool = {
      name: "browser_content",
      description: "Get the text content of the current web page. Returns visible text, not HTML.",
      parameters: z12.object({}),
      async execute(_args, ctx) {
        const client = await ctx.getBrowser();
        const page = await client.getPage();
        const url = page.url();
        const title = await page.title();
        const bodyText = await page.innerText("body").catch(() => "");
        const text = bodyText.length > MAX_CONTENT_LENGTH ? bodyText.slice(0, MAX_CONTENT_LENGTH) + "\n...(truncated)" : bodyText;
        return {
          type: "text",
          content: `URL: ${url}\nTitle: ${title}\n\n${text}`
        };
      }
    };
  }
});
889
+
890
+ // src/tools/browser/scroll.ts
891
+ import { z as z13 } from "zod";
892
var browserScrollTool;
var init_scroll = __esm({
  "src/tools/browser/scroll.ts"() {
    "use strict";
    // Tool: scroll the page vertically by `amount` pixels; "up" scrolls by
    // the negated amount.
    browserScrollTool = {
      name: "browser_scroll",
      description: "Scroll the current web page.",
      parameters: z13.object({
        direction: z13.enum(["up", "down"]).describe("Scroll direction"),
        amount: z13.number().positive().default(500).describe("Pixels to scroll")
      }),
      async execute(args, ctx) {
        const client = await ctx.getBrowser();
        const page = await client.getPage();
        let offset = args.amount;
        if (args.direction !== "down") {
          offset = -offset;
        }
        await page.evaluate((pixels) => window.scrollBy(0, pixels), offset);
        return { type: "text", content: `Scrolled ${args.direction} ${args.amount}px` };
      }
    };
  }
});
913
+
914
+ // src/tools/control/report.ts
915
+ import { z as z14 } from "zod";
916
var reportTool;
var init_report = __esm({
  "src/tools/control/report.ts"() {
    "use strict";
    // Best-effort capture of the primary monitor (or the first one listed)
    // as a base64 PNG. Any failure is deliberately ignored and yields
    // `undefined`, so a report is never lost over a failed screenshot.
    const capturePrimaryScreen = async () => {
      try {
        const { Monitor } = await import("node-screenshots");
        const monitors = Monitor.all();
        const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
        if (!primary) return void 0;
        return primary.captureImageSync().toPngSync().toString("base64");
      } catch {
        return void 0;
      }
    };
    // Tool: hand a status report back to the caller, optionally with a
    // screenshot and structured data.
    reportTool = {
      name: "report",
      description: "Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.",
      parameters: z14.object({
        status: z14.enum(["completed", "blocked", "need_guidance"]).describe(
          '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
        ),
        summary: z14.string().describe("Concise human-readable summary of what was accomplished or what the problem is"),
        include_screenshot: z14.boolean().default(false).describe("Whether to capture and include a screenshot of the current state"),
        data: z14.unknown().optional().describe("Optional structured data to return")
      }),
      async execute(args) {
        const screenshot = args.include_screenshot ? await capturePrimaryScreen() : void 0;
        return {
          type: "report",
          status: args.status,
          summary: args.summary,
          screenshot,
          data: args.data
        };
      }
    };
  }
});
957
+
958
+ // src/tools/index.ts
959
/**
 * Create a registry pre-populated with every built-in tool, registered in a
 * fixed order: screen and input tools, command and file tools, browser tools,
 * and finally `report`.
 *
 * @returns {ToolRegistry} A new registry instance.
 */
function createToolRegistry() {
  const registry2 = new ToolRegistry();
  const builtinTools = [
    screenshotTool,
    mouseClickTool,
    mouseMoveTool,
    mouseScrollTool,
    keyboardTypeTool,
    keyboardPressTool,
    runCommandTool,
    fileReadTool,
    fileWriteTool,
    browserNavigateTool,
    browserClickTool,
    browserTypeTool,
    browserScreenshotTool,
    browserContentTool,
    browserScrollTool,
    reportTool
  ];
  for (const tool of builtinTools) {
    registry2.register(tool);
  }
  return registry2;
}
979
// Lazy initializer for src/tools/index.ts. When first invoked it runs the
// initializers of the registry module and of every tool module, in the order
// listed below.
var init_tools = __esm({
  "src/tools/index.ts": () => {
    "use strict";
    const dependencyInitializers = [
      init_registry,
      init_screenshot,
      init_mouse,
      init_keyboard,
      init_command,
      init_read,
      init_write,
      init_navigate,
      init_click,
      init_type,
      init_screenshot2,
      init_content,
      init_scroll,
      init_report
    ];
    for (const initialize of dependencyInitializers) {
      initialize();
    }
  }
});
998
+
999
+ // src/mcp/session-registry.ts
1000
+ import crypto from "crypto";
1001
var SessionRegistry;
var init_session_registry = __esm({
  "src/mcp/session-registry.ts"() {
    "use strict";
    init_context_manager();
    init_llm_client();
    init_runner();
    init_client();
    init_tools();
    /**
     * In-memory registry of automation sessions, keyed by a random UUID.
     *
     * Each session bundles an AgentRunner, a BrowserClient and an inactivity
     * timer. When the timer fires the session is destroyed; `touch()` restarts
     * the timer.
     */
    SessionRegistry = class {
      /** Live sessions by id. */
      sessions = new Map();

      /**
       * Arm an inactivity timer that destroys the session when it fires.
       *
       * @param {string} id - Session id to destroy on expiry.
       * @param {number} timeoutMs - Delay in milliseconds.
       * @returns The timer handle returned by `setTimeout`.
       */
      #scheduleExpiry(id, timeoutMs) {
        return setTimeout(() => {
          // Fire-and-forget on purpose: destroy() swallows close() failures,
          // so this promise is not expected to reject.
          void this.destroy(id);
        }, timeoutMs);
      }

      /**
       * Create and register a new session.
       *
       * @param {object} config - Parsed configuration; this method reads
       *   `contextWindowSize`, `cdpUrl` and `timeoutMs` and passes the whole
       *   object on to the LLM client and the runner.
       * @returns {object} The session record (`id`, `createdAt`,
       *   `lastActivityAt`, `config`, `runner`, `browserClient`,
       *   `timeoutHandle`).
       */
      create(config) {
        const id = crypto.randomUUID();
        const contextManager = new ContextManager(config.contextWindowSize);
        const llmClient = new LLMClient(config);
        const browserClient = new BrowserClient(config.cdpUrl);
        const toolRegistry = createToolRegistry();
        const toolContext = {
          sessionId: id,
          cdpUrl: config.cdpUrl,
          // Calls connect() on every access and resolves to the shared client.
          getBrowser: () => {
            return browserClient.connect().then(() => browserClient);
          }
        };
        const runner = new AgentRunner(
          llmClient,
          contextManager,
          toolRegistry,
          config,
          toolContext
        );
        const now = new Date();
        const session = {
          id,
          createdAt: now,
          lastActivityAt: new Date(now.getTime()),
          config,
          runner,
          browserClient,
          timeoutHandle: this.#scheduleExpiry(id, config.timeoutMs)
        };
        this.sessions.set(id, session);
        return session;
      }

      /**
       * Look up a session.
       *
       * @param {string} id - Session id.
       * @returns {object|undefined} The session, or undefined when unknown,
       *   expired or already destroyed.
       */
      get(id) {
        return this.sessions.get(id);
      }

      /**
       * Record activity on a session and restart its inactivity timer.
       * Does nothing when the session is not registered.
       *
       * @param {string} id - Session id.
       */
      touch(id) {
        const session = this.sessions.get(id);
        if (!session) return;
        session.lastActivityAt = new Date();
        clearTimeout(session.timeoutHandle);
        session.timeoutHandle = this.#scheduleExpiry(id, session.config.timeoutMs);
      }

      /**
       * Destroy a session: cancel its timer and close its browser client.
       * Does nothing when the session is not registered, so it is safe to
       * call more than once.
       *
       * @param {string} id - Session id.
       * @returns {Promise<void>} Resolves once the browser client close
       *   attempt has settled.
       */
      async destroy(id) {
        const session = this.sessions.get(id);
        if (!session) return;
        // Unregister BEFORE awaiting close(). Otherwise get() would keep
        // handing out a session whose browser is being torn down, and a
        // concurrent destroy() (timer expiry, done_session, destroyAll) would
        // close the same browser client a second time.
        this.sessions.delete(id);
        clearTimeout(session.timeoutHandle);
        // Best-effort cleanup: a failing close() must not prevent teardown.
        await session.browserClient.close().catch(() => {
        });
      }

      /**
       * Destroy every registered session; individual failures are ignored.
       *
       * @returns {Promise<void>}
       */
      async destroyAll() {
        await Promise.allSettled(
          [...this.sessions.keys()].map((id) => this.destroy(id))
        );
      }
    };
  }
});
1077
+
1078
+ // src/mcp/tools.ts
1079
+ import { z as z15 } from "zod";
1080
/**
 * Register the three MCP tools (`create_session`, `send_instruction`,
 * `done_session`) on the given server.
 *
 * @param {object} server2 - MCP server exposing `tool(name, description, schema, handler)`.
 * @param {object} registry2 - Session registry providing `create`, `get`,
 *   `touch` and `destroy`.
 */
function registerMcpTools(server2, registry2) {
  /** Wrap a JSON-serialisable payload as a single MCP text content item. */
  const toJsonTextContent = (payload) => ({
    type: "text",
    text: JSON.stringify(payload)
  });

  server2.tool(
    "create_session",
    "Create a new automation session with a small LLM agent. Returns a session_id.",
    {
      api_key: z15.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
      base_url: z15.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
      model: z15.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
      cdp_url: z15.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
      timeout_ms: z15.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
      max_steps: z15.number().optional().describe("Max tool-calling steps per instruction (default: 50)")
    },
    async (args) => {
      // Explicit arguments take precedence; loadConfig falls back to the
      // environment and built-in defaults for anything left undefined.
      const config = loadConfig({
        apiKey: args.api_key,
        baseURL: args.base_url,
        model: args.model,
        cdpUrl: args.cdp_url,
        timeoutMs: args.timeout_ms,
        maxSteps: args.max_steps
      });
      const session = registry2.create(config);
      return { content: [toJsonTextContent({ session_id: session.id })] };
    }
  );

  server2.tool(
    "send_instruction",
    "Send a task instruction to the agent in a session. The agent executes it and returns a status report.",
    {
      session_id: z15.string().describe("Session ID from create_session"),
      instruction: z15.string().describe("What you want the agent to do, in natural language")
    },
    async (args) => {
      const session = registry2.get(args.session_id);
      if (!session) {
        return {
          content: [
            toJsonTextContent({
              error: `Session "${args.session_id}" not found or expired`
            })
          ],
          isError: true
        };
      }
      // Record the activity, then suspend the inactivity timer for the
      // duration of the run. Otherwise an instruction that takes longer than
      // the session timeout would have its session (and browser) destroyed
      // while the agent is still working.
      registry2.touch(args.session_id);
      clearTimeout(session.timeoutHandle);
      let result;
      try {
        result = await session.runner.run(args.instruction);
      } finally {
        // Re-arm the timer whether the run succeeded or threw. touch() is a
        // no-op if the session was destroyed in the meantime.
        registry2.touch(args.session_id);
      }
      const content = [
        toJsonTextContent({
          status: result.status,
          summary: result.summary,
          steps_used: result.stepsUsed,
          // Only emit `data` when the agent actually returned some.
          ...result.data !== void 0 ? { data: result.data } : {}
        })
      ];
      if (result.screenshot) {
        content.push({
          type: "image",
          data: result.screenshot,
          mimeType: "image/png"
        });
      }
      return { content };
    }
  );

  server2.tool(
    "done_session",
    "Terminate a session and free all resources.",
    {
      session_id: z15.string().describe("Session ID to terminate")
    },
    async (args) => {
      // Reports success even for an unknown id, which makes the call idempotent.
      await registry2.destroy(args.session_id);
      return { content: [toJsonTextContent({ success: true })] };
    }
  );
}
1177
// Lazy initializer for src/mcp/tools.ts. Its only dependency is the config
// loader module, which provides loadConfig for the create_session handler.
var init_tools2 = __esm({
  "src/mcp/tools.ts": () => {
    "use strict";
    init_loader();
  }
});
1183
+
1184
+ // src/mcp/server.ts
1185
+ var server_exports = {};
1186
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
1187
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
1188
/**
 * Signal handler for the MCP server: destroy every session held by the
 * module-level registry, then terminate the process with exit code 0.
 */
async function shutdown() {
  const teardown = registry.destroyAll();
  await teardown;
  process.exit(0);
}
1192
var server, registry, transport;
// Lazy, asynchronous initializer for src/mcp/server.ts. It builds the MCP
// server and session registry, registers the tools, installs the signal
// handlers and connects the server over a stdio transport.
var init_server = __esm({
  "src/mcp/server.ts": async () => {
    "use strict";
    init_session_registry();
    init_tools2();
    const serverInfo = { name: "windows-use", version: "0.1.0" };
    server = new McpServer(serverInfo);
    registry = new SessionRegistry();
    registerMcpTools(server, registry);
    // Both termination signals share the same handler.
    for (const signal of ["SIGINT", "SIGTERM"]) {
      process.on(signal, shutdown);
    }
    transport = new StdioServerTransport();
    await server.connect(transport);
    // Logged via console.error, i.e. to stderr rather than stdout.
    console.error("[windows-use] MCP server started");
  }
});
1211
+
1212
+ // src/cli.ts
1213
+ init_loader();
1214
+ init_session_registry();
1215
+ import { program } from "commander";
1216
/**
 * Commander option parser for integer flags.
 *
 * Commander invokes custom parsers as `(value, previous)`. Passing the global
 * `parseInt` directly would therefore use the previous value as the radix, so
 * the radix is pinned to 10 here.
 *
 * @param {string} value - Raw option text from the command line.
 * @returns {number} The parsed integer (NaN when the text is not numeric;
 *   config validation rejects that later).
 */
function cliParseInteger(value) {
  return Number.parseInt(value, 10);
}

/**
 * Write text to stdout and wait until the write callback has fired.
 *
 * stdout writes can be asynchronous (for example pipes on Windows), so calling
 * process.exit() straight after console.log may drop part of a large payload.
 *
 * @param {string} text - Text to write.
 * @returns {Promise<void>} Resolves when the chunk has been handled.
 */
function cliWriteStdout(text) {
  return new Promise((resolve, reject) => {
    process.stdout.write(text, (err) => (err ? reject(err) : resolve()));
  });
}

/**
 * Action for the `windows-use` command.
 *
 * Starts the MCP server when `--mcp` is given or no instruction is supplied.
 * Otherwise it runs the instruction once, prints the result as JSON on stdout
 * and exits with 0 when the result status is "completed", or 1 otherwise.
 *
 * @param {string|undefined} instruction - Task to perform.
 * @param {object} opts - Parsed commander options.
 */
async function runCliAction(instruction, opts) {
  if (opts.mcp || !instruction) {
    await init_server().then(() => server_exports);
    return;
  }
  let config;
  try {
    config = loadConfig({
      apiKey: opts.apiKey,
      baseURL: opts.baseUrl,
      model: opts.model,
      cdpUrl: opts.cdpUrl,
      maxSteps: opts.maxSteps
    });
  } catch (err) {
    console.error(
      "Configuration error. Set WINDOWS_USE_API_KEY, WINDOWS_USE_BASE_URL, WINDOWS_USE_MODEL env vars or pass --api-key, --base-url, --model flags."
    );
    console.error(err instanceof Error ? err.message : err);
    process.exit(1);
  }
  const registry2 = new SessionRegistry();
  const session = registry2.create(config);
  // One-shot mode never touches the session, so its inactivity timer would
  // destroy the session (and close the browser) in the middle of any task
  // that runs longer than the timeout. The process lifetime bounds the
  // session here, so the timer is cancelled.
  clearTimeout(session.timeoutHandle);
  console.error(`[windows-use] Session ${session.id} created`);
  console.error(`[windows-use] Running: "${instruction}"`);
  let exitCode = 1;
  try {
    const result = await session.runner.run(instruction);
    // Flush the result before exiting; it may embed a base64 screenshot.
    await cliWriteStdout(`${JSON.stringify(result, null, 2)}\n`);
    exitCode = result.status === "completed" ? 0 : 1;
  } catch (err) {
    console.error("Fatal error:", err instanceof Error ? err.message : err);
  } finally {
    // Always release the session, on success and on failure alike.
    await registry2.destroyAll();
  }
  process.exit(exitCode);
}

program
  .name("windows-use")
  .description("Run Windows/browser automation tasks using a small LLM agent")
  .argument("[instruction]", "The task to perform")
  .option("--api-key <key>", "LLM API key")
  .option("--base-url <url>", "OpenAI-compatible base URL")
  .option("--model <name>", "Model name")
  .option("--cdp-url <url>", "Chrome CDP URL (default: http://localhost:9222)")
  .option("--max-steps <n>", "Max steps before forced stop", cliParseInteger)
  .option("--mcp", "Start as MCP server instead of running a task")
  .action(runCliAction);
program.parse();
1253
+ //# sourceMappingURL=cli.js.map