windows-use 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,23 +8,14 @@ import crypto from "crypto";
8
8
  // src/agent/context-manager.ts
9
9
  var ContextManager = class {
10
10
  messages = [];
11
- maxMessages;
12
- constructor(maxMessages) {
13
- this.maxMessages = maxMessages;
14
- }
15
11
  append(message) {
16
12
  this.messages.push(message);
17
13
  }
18
- /** Returns the system prompt + the most recent messages within the window. */
19
- getWindow() {
20
- if (this.messages.length === 0) return [];
21
- const systemPrompt = this.messages[0]?.role === "system" ? this.messages[0] : null;
22
- const nonSystem = systemPrompt ? this.messages.slice(1) : this.messages;
23
- const windowSize = this.maxMessages - (systemPrompt ? 1 : 0);
24
- const windowed = nonSystem.length > windowSize ? nonSystem.slice(-windowSize) : nonSystem;
25
- return systemPrompt ? [systemPrompt, ...windowed] : windowed;
14
+ /** Returns all messages. */
15
+ getMessages() {
16
+ return [...this.messages];
26
17
  }
27
- /** Total messages stored (before windowing). */
18
+ /** Total messages stored. */
28
19
  get length() {
29
20
  return this.messages.length;
30
21
  }
@@ -78,7 +69,16 @@ Call \`report\` when:
78
69
  - **"blocked"**: You cannot proceed (CAPTCHA, login wall, unexpected error). Explain what's blocking you.
79
70
  - **"need_guidance"**: You need a decision or clarification. Describe what you need.
80
71
 
81
- Calling \`report\` stops your execution. Include a concise summary and optionally a screenshot as evidence.
72
+ Calling \`report\` stops your execution. The \`content\` field supports a rich document format \u2014 mix text with screenshots using \`[Image:img_X]\` markers:
73
+
74
+ \`\`\`
75
+ report({
76
+ status: "completed",
77
+ content: "Here is what I found:\\n[Image:img_2]\\nThe page shows the search results.\\n[Image:img_3]\\nI also checked the sidebar."
78
+ })
79
+ \`\`\`
80
+
81
+ Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
82
82
 
83
83
  ## Important
84
84
  - Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
@@ -94,6 +94,8 @@ var AgentRunner = class {
94
94
  config;
95
95
  toolContext;
96
96
  initialized = false;
97
+ onStep = null;
98
+ roundsUsed = 0;
97
99
  constructor(llmClient, contextManager, toolRegistry, config, toolContext) {
98
100
  this.llmClient = llmClient;
99
101
  this.contextManager = contextManager;
@@ -101,7 +103,30 @@ var AgentRunner = class {
101
103
  this.config = config;
102
104
  this.toolContext = toolContext;
103
105
  }
106
+ /** Register a callback to receive step-by-step progress events */
107
+ setOnStep(cb) {
108
+ this.onStep = cb;
109
+ }
110
+ emit(event) {
111
+ this.onStep?.(event);
112
+ }
113
+ /** How many instruction rounds have been used in this session */
114
+ get currentRound() {
115
+ return this.roundsUsed;
116
+ }
117
+ /** Whether this session has exhausted its max rounds */
118
+ get roundsExhausted() {
119
+ return this.roundsUsed >= this.config.maxRounds;
120
+ }
104
121
  async run(instruction) {
122
+ if (this.roundsExhausted) {
123
+ return {
124
+ status: "blocked",
125
+ content: `Session has reached the maximum number of instruction rounds (${this.config.maxRounds}). Create a new session to continue.`,
126
+ stepsUsed: 0
127
+ };
128
+ }
129
+ this.roundsUsed++;
105
130
  if (!this.initialized) {
106
131
  this.contextManager.append({
107
132
  role: "system",
@@ -117,7 +142,7 @@ var AgentRunner = class {
117
142
  while (stepsUsed < this.config.maxSteps) {
118
143
  stepsUsed++;
119
144
  const remaining = this.config.maxSteps - stepsUsed;
120
- const messages = this.contextManager.getWindow();
145
+ const messages = this.contextManager.getMessages();
121
146
  if (remaining <= 3 && remaining >= 0) {
122
147
  messages.push({
123
148
  role: "system",
@@ -130,9 +155,10 @@ var AgentRunner = class {
130
155
  response = await this.llmClient.chat(messages, tools);
131
156
  } catch (err) {
132
157
  const msg = err instanceof Error ? err.message : String(err);
158
+ this.emit({ type: "error", step: stepsUsed, message: `LLM API error: ${msg}` });
133
159
  return {
134
160
  status: "blocked",
135
- summary: `LLM API error: ${msg}`,
161
+ content: `LLM API error: ${msg}`,
136
162
  stepsUsed
137
163
  };
138
164
  }
@@ -140,26 +166,31 @@ var AgentRunner = class {
140
166
  if (!choice) {
141
167
  return {
142
168
  status: "blocked",
143
- summary: "LLM returned empty response",
169
+ content: "LLM returned empty response",
144
170
  stepsUsed
145
171
  };
146
172
  }
147
173
  const message = choice.message;
174
+ if (message.content) {
175
+ this.emit({ type: "thinking", step: stepsUsed, content: message.content });
176
+ }
148
177
  if (choice.finish_reason === "stop" || !message.tool_calls?.length) {
149
178
  const text = message.content ?? "";
150
179
  this.contextManager.append({ role: "assistant", content: text });
151
180
  return {
152
181
  status: "need_guidance",
153
- summary: text || "Agent stopped without calling report.",
182
+ content: text || "Agent stopped without calling report.",
154
183
  stepsUsed
155
184
  };
156
185
  }
157
186
  this.contextManager.append(message);
158
187
  for (const toolCall of message.tool_calls) {
188
+ const toolName = toolCall.function.name;
159
189
  let args;
160
190
  try {
161
191
  args = JSON.parse(toolCall.function.arguments);
162
192
  } catch {
193
+ this.emit({ type: "error", step: stepsUsed, message: `Failed to parse args for ${toolName}` });
163
194
  this.contextManager.append({
164
195
  role: "tool",
165
196
  tool_call_id: toolCall.id,
@@ -167,15 +198,17 @@ var AgentRunner = class {
167
198
  });
168
199
  continue;
169
200
  }
201
+ this.emit({ type: "tool_call", step: stepsUsed, name: toolName, args });
170
202
  let result;
171
203
  try {
172
204
  result = await this.toolRegistry.execute(
173
- toolCall.function.name,
205
+ toolName,
174
206
  args,
175
207
  this.toolContext
176
208
  );
177
209
  } catch (err) {
178
210
  const msg = err instanceof Error ? err.message : String(err);
211
+ this.emit({ type: "error", step: stepsUsed, message: `${toolName} failed: ${msg}` });
179
212
  this.contextManager.append({
180
213
  role: "tool",
181
214
  tool_call_id: toolCall.id,
@@ -184,6 +217,7 @@ var AgentRunner = class {
184
217
  continue;
185
218
  }
186
219
  if (result.type === "report") {
220
+ this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `[${result.status}] report submitted` });
187
221
  this.contextManager.append({
188
222
  role: "tool",
189
223
  tool_call_id: toolCall.id,
@@ -191,18 +225,18 @@ var AgentRunner = class {
191
225
  });
192
226
  return {
193
227
  status: result.status,
194
- summary: result.summary,
195
- screenshot: result.screenshot,
228
+ content: result.content,
196
229
  data: result.data,
197
230
  stepsUsed
198
231
  };
199
232
  }
200
233
  if (result.type === "image") {
234
+ this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
201
235
  this.contextManager.append({
202
236
  role: "tool",
203
237
  tool_call_id: toolCall.id,
204
238
  content: [
205
- { type: "text", text: "Screenshot captured." },
239
+ { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
206
240
  {
207
241
  type: "image_url",
208
242
  image_url: {
@@ -212,6 +246,8 @@ var AgentRunner = class {
212
246
  ]
213
247
  });
214
248
  } else {
249
+ const preview = result.content.length > 200 ? result.content.slice(0, 200) + "..." : result.content;
250
+ this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: preview });
215
251
  this.contextManager.append({
216
252
  role: "tool",
217
253
  tool_call_id: toolCall.id,
@@ -222,30 +258,208 @@ var AgentRunner = class {
222
258
  }
223
259
  return {
224
260
  status: "blocked",
225
- summary: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
261
+ content: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
226
262
  stepsUsed
227
263
  };
228
264
  }
229
265
  };
230
266
 
231
267
  // src/tools/browser/client.ts
268
+ import { existsSync, mkdirSync, cpSync, readdirSync } from "fs";
269
+ import { spawn, execSync } from "child_process";
270
+ import { join } from "path";
271
+ import { homedir } from "os";
272
// Candidate Chrome/Chromium executables, listed in the order they are probed.
// The per-user Windows install is only listed when LOCALAPPDATA is set: with the
// variable missing, the template would collapse to "\Google\Chrome\..." — a
// root-relative path that does not refer to the user's install.
var CHROME_PATHS = [
  // Windows
  "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
  "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
  ...(process.env.LOCALAPPDATA
    ? [`${process.env.LOCALAPPDATA}\\Google\\Chrome\\Application\\chrome.exe`]
    : []),
  // macOS
  "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  // Linux
  "/usr/bin/google-chrome",
  "/usr/bin/google-chrome-stable",
  "/usr/bin/chromium-browser",
  "/usr/bin/chromium"
];
285
// Directory names that are never copied when mirroring a Chrome profile:
// caches, storage backends and diagnostics folders.
var SKIP_DIRS = /* @__PURE__ */ new Set([
  // HTTP / code / GPU caches
  "Cache", "Code Cache", "GPUCache",
  // Site storage
  "Service Worker", "CacheStorage", "File System", "blob_storage", "IndexedDB",
  // Shader caches
  "DawnCache", "GrShaderCache", "ShaderCache",
  // Browser-internal models, metrics and crash data
  "optimization_guide_model_store", "BrowserMetrics", "Crashpad", "component_crx_cache"
]);
302
/**
 * Locate an installed Chrome/Chromium executable.
 *
 * @returns {string | null} the first entry of CHROME_PATHS that exists on disk,
 *   or null when none of them does.
 */
function findChrome() {
  const installed = CHROME_PATHS.find((candidate) => candidate && existsSync(candidate));
  return installed ?? null;
}
308
/**
 * Locate the user's existing Chrome/Chromium "user data" directory.
 *
 * Candidates are probed in order: Windows (%LOCALAPPDATA%), macOS, then Linux
 * (google-chrome, chromium).
 *
 * @returns {string | null} the first candidate directory that exists, or null.
 */
function findUserDataDir() {
  const home = homedir();
  const localAppData = process.env.LOCALAPPDATA;
  const candidates = [
    // Windows — only when LOCALAPPDATA is set; joining an empty base would give
    // a path relative to the current working directory.
    ...(localAppData ? [join(localAppData, "Google", "Chrome", "User Data")] : []),
    // macOS
    join(home, "Library", "Application Support", "Google", "Chrome"),
    // Linux
    join(home, ".config", "google-chrome"),
    join(home, ".config", "chromium")
  ];
  return candidates.find((dir) => existsSync(dir)) ?? null;
}
323
/**
 * Extract the TCP port from a CDP endpoint URL.
 *
 * @param {string} cdpUrl - endpoint URL, e.g. "http://localhost:9222".
 * @returns {number} the explicit port of the URL; 9222 when the URL cannot be
 *   parsed or carries no (non-zero) explicit port.
 */
function getCdpPort(cdpUrl) {
  const fallbackPort = 9222;
  let endpoint;
  try {
    endpoint = new URL(cdpUrl);
  } catch {
    return fallbackPort;
  }
  // URL#port is "" when no port is written (or it equals the scheme default),
  // which parses to NaN and falls through to the fallback.
  const explicitPort = Number.parseInt(endpoint.port, 10);
  return explicitPort || fallbackPort;
}
330
/**
 * Best-effort check for a running Chrome/Chromium process.
 *
 * Windows: looks for "chrome.exe" in the `tasklist` output.
 * Other platforms: runs `pgrep -x`, which exits non-zero (so execSync throws)
 * when nothing matches. The pattern includes "Google Chrome", the process name
 * used on macOS, in addition to the Linux binary names.
 *
 * @returns {boolean} true when a matching process was found; false when none
 *   was found or the probe command itself failed.
 */
function isChromeRunning() {
  try {
    if (process.platform === "win32") {
      const out = execSync('tasklist /FI "IMAGENAME eq chrome.exe" /NH', {
        encoding: "utf-8",
        windowsHide: true
      });
      return out.includes("chrome.exe");
    }
    execSync('pgrep -x "chrome|chromium|google-chrome|Google Chrome"', { encoding: "utf-8" });
    return true;
  } catch {
    // No match, or the probe tool is unavailable: report "not running".
    return false;
  }
}
346
/**
 * Mirror a Chrome "user data" directory into `targetDir` on a best-effort basis.
 *
 * Top-level files are copied as-is. "Default" and "Profile N" directories are
 * handed to syncProfileDir; other directories are copied recursively unless
 * their name is in SKIP_DIRS. Individual copy failures are ignored so that one
 * failing entry does not abort the whole sync.
 *
 * @param {string} sourceDir - existing Chrome user data directory.
 * @param {string} targetDir - destination directory (created if missing).
 */
function syncProfile(sourceDir, targetDir) {
  mkdirSync(targetDir, { recursive: true });
  let entries;
  try {
    entries = readdirSync(sourceDir, { withFileTypes: true });
  } catch {
    // Source is missing or unreadable: nothing to sync. The target directory
    // already exists, so the caller can still start from an empty profile.
    return;
  }
  for (const entry of entries) {
    const src = join(sourceDir, entry.name);
    const dst = join(targetDir, entry.name);
    if (entry.isFile()) {
      try {
        cpSync(src, dst, { force: true });
      } catch {
        // Best effort: skip files that cannot be copied.
      }
    } else if (entry.isDirectory()) {
      if (entry.name === "Default" || entry.name.startsWith("Profile ")) {
        syncProfileDir(src, dst);
      } else if (!SKIP_DIRS.has(entry.name)) {
        try {
          cpSync(src, dst, { recursive: true, force: true });
        } catch {
          // Best effort: skip directories that cannot be copied.
        }
      }
    }
  }
}
369
/**
 * Copy one Chrome profile directory into `targetDir`, leaving out every entry
 * whose name is listed in SKIP_DIRS.
 *
 * The target directory is created first. If the source cannot be listed the
 * function returns without copying anything; failures on individual entries
 * are ignored.
 *
 * @param {string} sourceDir - profile directory to read from.
 * @param {string} targetDir - profile directory to write to.
 */
function syncProfileDir(sourceDir, targetDir) {
  mkdirSync(targetDir, { recursive: true });

  let children;
  try {
    children = readdirSync(sourceDir, { withFileTypes: true });
  } catch {
    return;
  }

  const wanted = children.filter((child) => !SKIP_DIRS.has(child.name));
  for (const child of wanted) {
    let copyOptions = null;
    if (child.isFile()) {
      copyOptions = { force: true };
    } else if (child.isDirectory()) {
      copyOptions = { recursive: true, force: true };
    }
    // Anything that is neither a regular file nor a directory is left alone.
    if (copyOptions === null) continue;

    try {
      cpSync(join(sourceDir, child.name), join(targetDir, child.name), copyOptions);
    } catch {
      // Best effort: a failing entry is skipped.
    }
  }
}
232
391
  var BrowserClient = class {
233
392
  browser = null;
234
393
  context = null;
235
394
  _page = null;
236
395
  cdpUrl;
396
+ chromeProcess = null;
237
397
  constructor(cdpUrl) {
238
398
  this.cdpUrl = cdpUrl;
239
399
  }
240
400
  async connect() {
241
401
  if (this.browser) return;
242
402
  const { chromium } = await import("playwright");
243
- this.browser = await chromium.connectOverCDP(this.cdpUrl);
403
+ try {
404
+ this.browser = await chromium.connectOverCDP(this.cdpUrl);
405
+ } catch {
406
+ await this.launchChrome();
407
+ this.browser = await chromium.connectOverCDP(this.cdpUrl);
408
+ }
244
409
  const contexts = this.browser.contexts();
245
410
  this.context = contexts[0] ?? await this.browser.newContext();
246
411
  const pages = this.context.pages();
247
412
  this._page = pages[0] ?? await this.context.newPage();
248
413
  }
414
+ async launchChrome() {
415
+ const chromePath = findChrome();
416
+ if (!chromePath) {
417
+ throw new Error(
418
+ "Chrome not found. Please install Chrome or start it manually with: chrome --remote-debugging-port=9222"
419
+ );
420
+ }
421
+ const port = getCdpPort(this.cdpUrl);
422
+ if (isChromeRunning()) {
423
+ console.error("[windows-use] Chrome is running without CDP. Restarting with --remote-debugging-port...");
424
+ try {
425
+ if (process.platform === "win32") {
426
+ execSync("taskkill /F /IM chrome.exe /T", { windowsHide: true, stdio: "ignore" });
427
+ } else {
428
+ execSync("pkill -f chrome", { stdio: "ignore" });
429
+ }
430
+ } catch {
431
+ }
432
+ await new Promise((r) => setTimeout(r, 1500));
433
+ }
434
+ const targetDir = join(homedir(), ".windows-use", "chrome-profile");
435
+ const userDir = findUserDataDir();
436
+ if (userDir) {
437
+ console.error("[windows-use] Syncing Chrome profile (cookies, login state)...");
438
+ syncProfile(userDir, targetDir);
439
+ console.error("[windows-use] Profile synced.");
440
+ } else {
441
+ mkdirSync(targetDir, { recursive: true });
442
+ }
443
+ console.error(`[windows-use] Launching Chrome with --remote-debugging-port=${port}`);
444
+ this.chromeProcess = spawn(
445
+ chromePath,
446
+ [
447
+ `--remote-debugging-port=${port}`,
448
+ `--user-data-dir=${targetDir}`
449
+ ],
450
+ { detached: true, stdio: "ignore" }
451
+ );
452
+ this.chromeProcess.unref();
453
+ for (let i = 0; i < 30; i++) {
454
+ try {
455
+ const res = await fetch(`http://localhost:${port}/json/version`);
456
+ if (res.ok) return;
457
+ } catch {
458
+ }
459
+ await new Promise((r) => setTimeout(r, 500));
460
+ }
461
+ throw new Error("Chrome launched but CDP endpoint did not become available within 15s");
462
+ }
249
463
  async getPage() {
250
464
  await this.connect();
251
465
  return this._page;
@@ -380,11 +594,69 @@ var ToolRegistry = class {
380
594
 
381
595
  // src/tools/windows/screenshot.ts
382
596
  import { z } from "zod";
597
+ import sharp2 from "sharp";
598
+
599
+ // src/tools/windows/grid-overlay.ts
600
+ import sharp from "sharp";
601
/**
 * Build the SVG markup of the coordinate overlay: grid lines, axis labels,
 * an origin marker and a width×height marker in the bottom-right corner.
 * Pure string construction; spacings must be positive.
 */
function buildCoordinateGridSvg(width, height, gridSpacing, labelSpacing) {
  const majorSpacing = gridSpacing * 5;
  const labelFont = 'fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold"';
  // Every fifth grid line is drawn heavier and more opaque.
  const lineStyle = (position) => {
    const isMajor = position % majorSpacing === 0;
    return `stroke="rgba(255,50,50,${isMajor ? 0.35 : 0.15})" stroke-width="${isMajor ? 1.5 : 0.5}"`;
  };
  const backdrop = (x, y, w) =>
    `<rect x="${x}" y="${y}" width="${w}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`;
  // Rough label width: 7.5px per character plus padding.
  const labelWidth = (text) => text.length * 7.5 + 6;

  const parts = [];
  for (let x = gridSpacing; x < width; x += gridSpacing) {
    parts.push(`<line x1="${x}" y1="0" x2="${x}" y2="${height}" ${lineStyle(x)}/>`);
  }
  for (let y = gridSpacing; y < height; y += gridSpacing) {
    parts.push(`<line x1="0" y1="${y}" x2="${width}" y2="${y}" ${lineStyle(y)}/>`);
  }
  // X-axis labels along the top edge, centred on their grid line.
  for (let x = labelSpacing; x < width; x += labelSpacing) {
    const text = String(x);
    const tw = labelWidth(text);
    parts.push(
      backdrop(x - tw / 2, 2, tw),
      `<text x="${x}" y="14" text-anchor="middle" ${labelFont}>${text}</text>`
    );
  }
  // Y-axis labels along the left edge.
  for (let y = labelSpacing; y < height; y += labelSpacing) {
    const text = String(y);
    parts.push(
      backdrop(2, y - 8, labelWidth(text)),
      `<text x="5" y="${y + 4}" ${labelFont}>${text}</text>`
    );
  }
  parts.push(backdrop(2, 2, 22), `<text x="5" y="14" ${labelFont}>0,0</text>`);
  const dimText = `${width}x${height}`;
  const dimTw = labelWidth(dimText);
  parts.push(
    backdrop(width - dimTw - 2, height - 18, dimTw),
    `<text x="${width - dimTw / 2 - 2}" y="${height - 6}" text-anchor="middle" ${labelFont}>${dimText}</text>`
  );
  return `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${parts.join("")}</svg>`;
}

/**
 * Composite a pixel-coordinate grid over an image and return it as JPEG.
 *
 * @param {Buffer} imageBuffer - encoded source image, passed to sharp.
 * @param {number} width - overlay width in pixels.
 * @param {number} height - overlay height in pixels.
 * @param {{gridSpacing?: number, labelSpacing?: number}} [options] - distance
 *   between grid lines (default 100) and between axis labels (default 200).
 * @returns {Promise<Buffer>} JPEG (quality 70) with the grid drawn on top.
 * @throws {RangeError} when a spacing is not a positive finite number — such a
 *   value would otherwise make the drawing loops run forever.
 */
async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
  const gridSpacing = options.gridSpacing ?? 100;
  const labelSpacing = options.labelSpacing ?? 200;
  for (const [name, value] of [["gridSpacing", gridSpacing], ["labelSpacing", labelSpacing]]) {
    if (!Number.isFinite(value) || value <= 0) {
      throw new RangeError(`${name} must be a positive number, got ${value}`);
    }
  }
  const svg = Buffer.from(buildCoordinateGridSvg(width, height, gridSpacing, labelSpacing));
  return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
}
653
+
654
+ // src/tools/windows/screenshot.ts
383
655
  var screenshotTool = {
384
656
  name: "screenshot",
385
- description: "Capture the full screen and return it as an image. Use this to see what is currently displayed.",
657
+ description: "Capture the full screen with a coordinate grid overlay. The grid shows pixel coordinates that match mouse_click/mouse_move coordinates. Returns a screenshot ID.",
386
658
  parameters: z.object({}),
387
- async execute() {
659
+ async execute(_args, ctx) {
388
660
  const { Monitor } = await import("node-screenshots");
389
661
  const monitors = Monitor.all();
390
662
  const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
@@ -392,11 +664,24 @@ var screenshotTool = {
392
664
  return { type: "text", content: "Error: No monitor found" };
393
665
  }
394
666
  const image = primary.captureImageSync();
395
- const buf = image.toPngSync();
667
+ const physW = image.width;
668
+ const physH = image.height;
669
+ const scaleFactor = primary.scaleFactor ?? 1;
670
+ const logicalW = Math.round(physW / scaleFactor);
671
+ const logicalH = Math.round(physH / scaleFactor);
672
+ const raw = image.toRawSync();
673
+ const resized = await sharp2(raw, {
674
+ raw: { width: physW, height: physH, channels: 4 }
675
+ }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
676
+ const cleanBase64 = resized.toString("base64");
677
+ const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
678
+ const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
679
+ const gridBase64 = gridImage.toString("base64");
396
680
  return {
397
681
  type: "image",
398
- base64: buf.toString("base64"),
399
- mimeType: "image/png"
682
+ base64: gridBase64,
683
+ mimeType: "image/jpeg",
684
+ screenshotId: id
400
685
  };
401
686
  }
402
687
  };
@@ -631,13 +916,46 @@ var fileWriteTool = {
631
916
  }
632
917
  };
633
918
 
634
- // src/tools/browser/navigate.ts
919
+ // src/tools/file/image.ts
635
920
  import { z as z7 } from "zod";
921
+ import { readFileSync, existsSync as existsSync2 } from "fs";
922
+ import { extname } from "path";
923
// Supported image extensions mapped to the MIME type stored with the image.
// Each format gets its own type; .bmp and .webp must not be labelled as JPEG.
var IMAGE_MIME_TYPES = /* @__PURE__ */ new Map([
  [".png", "image/png"],
  [".jpg", "image/jpeg"],
  [".jpeg", "image/jpeg"],
  [".bmp", "image/bmp"],
  [".webp", "image/webp"]
]);
var IMAGE_EXTS = /* @__PURE__ */ new Set(IMAGE_MIME_TYPES.keys());
/**
 * Tool: load an image file from disk, register it in the session's screenshot
 * store and return it as an image result carrying the assigned screenshot ID.
 * Returns a text result describing the problem when the file does not exist or
 * its extension is not supported.
 */
var useLocalImageTool = {
  name: "use_local_image",
  description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
  parameters: z7.object({
    path: z7.string().describe("Absolute path to the image file"),
    label: z7.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
  }),
  async execute(args, ctx) {
    if (!existsSync2(args.path)) {
      return { type: "text", content: `Error: File not found: ${args.path}` };
    }
    const ext = extname(args.path).toLowerCase();
    const mimeType = IMAGE_MIME_TYPES.get(ext);
    if (!mimeType) {
      return { type: "text", content: `Error: Not a supported image format (${ext}). Supported: ${[...IMAGE_EXTS].join(", ")}` };
    }
    const base64 = readFileSync(args.path).toString("base64");
    const id = ctx.screenshots.save(base64, mimeType, args.label);
    return {
      type: "image",
      base64,
      mimeType,
      screenshotId: id
    };
  }
};
951
+
952
+ // src/tools/browser/navigate.ts
953
+ import { z as z8 } from "zod";
636
954
  var browserNavigateTool = {
637
955
  name: "browser_navigate",
638
956
  description: "Navigate the browser to a URL.",
639
- parameters: z7.object({
640
- url: z7.string().describe("The URL to navigate to")
957
+ parameters: z8.object({
958
+ url: z8.string().describe("The URL to navigate to")
641
959
  }),
642
960
  async execute(args, ctx) {
643
961
  const browser = await ctx.getBrowser();
@@ -650,12 +968,12 @@ Page title: ${title}` };
650
968
  };
651
969
 
652
970
  // src/tools/browser/click.ts
653
- import { z as z8 } from "zod";
971
+ import { z as z9 } from "zod";
654
972
  var browserClickTool = {
655
973
  name: "browser_click",
656
974
  description: "Click an element on the web page using a CSS selector or text content.",
657
- parameters: z8.object({
658
- selector: z8.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
975
+ parameters: z9.object({
976
+ selector: z9.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
659
977
  }),
660
978
  async execute(args, ctx) {
661
979
  const browser = await ctx.getBrowser();
@@ -666,14 +984,14 @@ var browserClickTool = {
666
984
  };
667
985
 
668
986
  // src/tools/browser/type.ts
669
- import { z as z9 } from "zod";
987
+ import { z as z10 } from "zod";
670
988
  var browserTypeTool = {
671
989
  name: "browser_type",
672
990
  description: "Type text into an input field on the web page.",
673
- parameters: z9.object({
674
- selector: z9.string().describe("CSS selector for the input element"),
675
- text: z9.string().describe("Text to type"),
676
- clear: z9.boolean().default(true).describe("Whether to clear the field before typing")
991
+ parameters: z10.object({
992
+ selector: z10.string().describe("CSS selector for the input element"),
993
+ text: z10.string().describe("Text to type"),
994
+ clear: z10.boolean().default(true).describe("Whether to clear the field before typing")
677
995
  }),
678
996
  async execute(args, ctx) {
679
997
  const browser = await ctx.getBrowser();
@@ -688,35 +1006,40 @@ var browserTypeTool = {
688
1006
  };
689
1007
 
690
1008
  // src/tools/browser/screenshot.ts
691
- import { z as z10 } from "zod";
1009
+ import { z as z11 } from "zod";
692
1010
  var browserScreenshotTool = {
693
1011
  name: "browser_screenshot",
694
- description: "Take a screenshot of the current browser page.",
695
- parameters: z10.object({
696
- fullPage: z10.boolean().default(false).describe("Whether to capture the full scrollable page")
1012
+ description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
1013
+ parameters: z11.object({
1014
+ fullPage: z11.boolean().default(false).describe("Whether to capture the full scrollable page")
697
1015
  }),
698
1016
  async execute(args, ctx) {
699
1017
  const browser = await ctx.getBrowser();
700
1018
  const page = await browser.getPage();
701
1019
  const buf = await page.screenshot({
702
- type: "png",
703
- fullPage: args.fullPage
1020
+ type: "jpeg",
1021
+ quality: 70,
1022
+ fullPage: args.fullPage,
1023
+ scale: "css"
704
1024
  });
1025
+ const base64 = buf.toString("base64");
1026
+ const id = ctx.screenshots.save(base64, "image/jpeg", "browser");
705
1027
  return {
706
1028
  type: "image",
707
- base64: buf.toString("base64"),
708
- mimeType: "image/png"
1029
+ base64,
1030
+ mimeType: "image/jpeg",
1031
+ screenshotId: id
709
1032
  };
710
1033
  }
711
1034
  };
712
1035
 
713
1036
  // src/tools/browser/content.ts
714
- import { z as z11 } from "zod";
1037
+ import { z as z12 } from "zod";
715
1038
  var MAX_CONTENT_LENGTH = 2e4;
716
1039
  var browserContentTool = {
717
1040
  name: "browser_content",
718
1041
  description: "Get the text content of the current web page. Returns visible text, not HTML.",
719
- parameters: z11.object({}),
1042
+ parameters: z12.object({}),
720
1043
  async execute(_args, ctx) {
721
1044
  const browser = await ctx.getBrowser();
722
1045
  const page = await browser.getPage();
@@ -737,13 +1060,13 @@ ${text}`
737
1060
  };
738
1061
 
739
1062
  // src/tools/browser/scroll.ts
740
- import { z as z12 } from "zod";
1063
+ import { z as z13 } from "zod";
741
1064
  var browserScrollTool = {
742
1065
  name: "browser_scroll",
743
1066
  description: "Scroll the current web page.",
744
- parameters: z12.object({
745
- direction: z12.enum(["up", "down"]).describe("Scroll direction"),
746
- amount: z12.number().positive().default(500).describe("Pixels to scroll")
1067
+ parameters: z13.object({
1068
+ direction: z13.enum(["up", "down"]).describe("Scroll direction"),
1069
+ amount: z13.number().positive().default(500).describe("Pixels to scroll")
747
1070
  }),
748
1071
  async execute(args, ctx) {
749
1072
  const browser = await ctx.getBrowser();
@@ -755,38 +1078,22 @@ var browserScrollTool = {
755
1078
  };
756
1079
 
757
1080
  // src/tools/control/report.ts
758
- import { z as z13 } from "zod";
1081
+ import { z as z14 } from "zod";
759
1082
  var reportTool = {
760
1083
  name: "report",
761
- description: "Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.",
762
- parameters: z13.object({
763
- status: z13.enum(["completed", "blocked", "need_guidance"]).describe(
1084
+ description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
1085
+ parameters: z14.object({
1086
+ status: z14.enum(["completed", "blocked", "need_guidance"]).describe(
764
1087
  '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
765
1088
  ),
766
- summary: z13.string().describe("Concise human-readable summary of what was accomplished or what the problem is"),
767
- include_screenshot: z13.boolean().default(false).describe("Whether to capture and include a screenshot of the current state"),
768
- data: z13.unknown().optional().describe("Optional structured data to return")
1089
+ content: z14.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
1090
+ data: z14.unknown().optional().describe("Optional structured data to return")
769
1091
  }),
770
1092
  async execute(args) {
771
- let screenshot;
772
- if (args.include_screenshot) {
773
- try {
774
- const { Monitor } = await import("node-screenshots");
775
- const monitors = Monitor.all();
776
- const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
777
- if (primary) {
778
- const image = primary.captureImageSync();
779
- const buf = image.toPngSync();
780
- screenshot = buf.toString("base64");
781
- }
782
- } catch {
783
- }
784
- }
785
1093
  return {
786
1094
  type: "report",
787
1095
  status: args.status,
788
- summary: args.summary,
789
- screenshot,
1096
+ content: args.content,
790
1097
  data: args.data
791
1098
  };
792
1099
  }
@@ -804,6 +1111,7 @@ function createToolRegistry() {
804
1111
  registry2.register(runCommandTool);
805
1112
  registry2.register(fileReadTool);
806
1113
  registry2.register(fileWriteTool);
1114
+ registry2.register(useLocalImageTool);
807
1115
  registry2.register(browserNavigateTool);
808
1116
  registry2.register(browserClickTool);
809
1117
  registry2.register(browserTypeTool);
@@ -814,21 +1122,70 @@ function createToolRegistry() {
814
1122
  return registry2;
815
1123
  }
816
1124
 
1125
+ // src/tools/types.ts
1126
/**
 * In-memory registry of captured images, keyed by sequential IDs
 * ("img_1", "img_2", ...).
 */
var ScreenshotStore = class {
  /** Number of images saved so far; also the numeric part of the latest ID. */
  counter = 0;
  /** Saved records by ID, in insertion order. */
  store = /* @__PURE__ */ new Map();

  /**
   * Store an image and hand back its newly assigned ID.
   * @param {string} base64 - image payload.
   * @param {string} mimeType - MIME type recorded with the image.
   * @param {string} label - free-form label recorded with the image.
   * @returns {string} the new ID, e.g. "img_3".
   */
  save(base64, mimeType, label) {
    const id = `img_${++this.counter}`;
    const record = { id, base64, mimeType, label };
    this.store.set(id, record);
    return id;
  }

  /** Look up a saved record; undefined when the ID is unknown. */
  get(id) {
    return this.store.get(id);
  }

  /** All assigned IDs in the order they were saved. */
  listIds() {
    return Array.from(this.store.keys());
  }
};
1142
/**
 * Split report text into an ordered list of text and image blocks.
 *
 * Every `[Image:img_N]` marker whose ID is known to `store` becomes an image
 * block built from the stored record; a marker with an unknown ID is kept
 * verbatim as a text block. Text between markers is emitted unchanged.
 *
 * @param {string} content - report text, possibly containing image markers.
 * @param {{get(id: string): object | undefined}} store - screenshot lookup.
 * @returns {Array<object>} blocks in document order.
 */
function parseReportContent(content, store) {
  const markerPattern = /\[Image:(img_\d+)\]/g;
  const blocks = [];
  let cursor = 0;
  for (let hit = markerPattern.exec(content); hit !== null; hit = markerPattern.exec(content)) {
    const [marker, imageId] = hit;
    // Flush the text that sits between the previous marker and this one.
    if (hit.index > cursor) {
      blocks.push({ type: "text", text: content.slice(cursor, hit.index) });
    }
    const stored = store.get(imageId);
    if (!stored) {
      blocks.push({ type: "text", text: marker });
    } else {
      const { id, base64, mimeType, label } = stored;
      blocks.push({ type: "image", id, base64, mimeType, label });
    }
    cursor = markerPattern.lastIndex;
  }
  if (cursor < content.length) {
    blocks.push({ type: "text", text: content.slice(cursor) });
  }
  return blocks;
}
1171
+
817
1172
  // src/mcp/session-registry.ts
818
1173
  var SessionRegistry = class {
819
1174
  sessions = /* @__PURE__ */ new Map();
820
1175
  create(config) {
821
1176
  const id = crypto.randomUUID();
822
- const contextManager = new ContextManager(config.contextWindowSize);
1177
+ const contextManager = new ContextManager();
823
1178
  const llmClient = new LLMClient(config);
824
1179
  const browserClient = new BrowserClient(config.cdpUrl);
825
1180
  const toolRegistry = createToolRegistry();
1181
+ const screenshotStore = new ScreenshotStore();
826
1182
  const toolContext = {
827
1183
  sessionId: id,
828
1184
  cdpUrl: config.cdpUrl,
829
1185
  getBrowser: () => {
830
1186
  return browserClient.connect().then(() => browserClient);
831
- }
1187
+ },
1188
+ screenshots: screenshotStore
832
1189
  };
833
1190
  const runner = new AgentRunner(
834
1191
  llmClient,
@@ -848,6 +1205,7 @@ var SessionRegistry = class {
848
1205
  config,
849
1206
  runner,
850
1207
  browserClient,
1208
+ screenshots: screenshotStore,
851
1209
  timeoutHandle
852
1210
  };
853
1211
  this.sessions.set(id, session);
@@ -882,30 +1240,45 @@ var SessionRegistry = class {
882
1240
  };
883
1241
 
884
1242
  // src/mcp/tools.ts
885
- import { z as z15 } from "zod";
1243
+ import { z as z16 } from "zod";
1244
+
1245
+ // src/config/loader.ts
1246
+ import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
1247
+ import { join as join2 } from "path";
1248
+ import { homedir as homedir2 } from "os";
886
1249
 
887
1250
  // src/config/schema.ts
888
- import { z as z14 } from "zod";
889
- var ConfigSchema = z14.object({
890
- apiKey: z14.string().min(1, "API key is required"),
891
- baseURL: z14.string().url("Must be a valid URL"),
892
- model: z14.string().min(1, "Model name is required"),
893
- maxSteps: z14.number().int().positive().default(50),
894
- contextWindowSize: z14.number().int().positive().default(20),
895
- cdpUrl: z14.string().default("http://localhost:9222"),
896
- timeoutMs: z14.number().default(3e5)
1251
+ import { z as z15 } from "zod";
1252
/**
 * Schema for the resolved runtime configuration.
 *
 * `apiKey` and `model` must be non-empty strings and `baseURL` must be a URL.
 * `maxSteps` and `maxRounds` are positive integers defaulting to 50 and 20.
 * `cdpUrl` defaults to "http://localhost:9222" and `timeoutMs` to 300000.
 */
var ConfigSchema = (() => {
  // Small builders for the two field shapes that repeat below.
  const nonEmptyText = (message) => z15.string().min(1, message);
  const positiveInt = (fallback) => z15.number().int().positive().default(fallback);

  return z15.object({
    apiKey: nonEmptyText("API key is required"),
    baseURL: z15.string().url("Must be a valid URL"),
    model: nonEmptyText("Model name is required"),
    maxSteps: positiveInt(50),
    maxRounds: positiveInt(20),
    cdpUrl: z15.string().default("http://localhost:9222"),
    timeoutMs: z15.number().default(3e5)
  });
})();
898
1261
 
899
1262
  // src/config/loader.ts
1263
// Per-user config file: `.windows-use.json` in the home directory.
var CONFIG_FILE = join2(homedir2(), ".windows-use.json");

/**
 * Reads optional settings from the per-user config file.
 *
 * This is best-effort: a missing, unreadable or malformed file yields `{}`,
 * so callers can always read properties off the result.
 *
 * @returns {object} The parsed settings object, or `{}` when none is usable.
 */
function loadFileConfig() {
  if (!existsSync3(CONFIG_FILE)) return {};
  try {
    const parsed = JSON.parse(readFileSync2(CONFIG_FILE, "utf-8"));
    // Valid JSON is not necessarily an object: `null`, arrays, numbers and
    // strings all parse fine. Returning `null` would make a later property
    // read throw, so only a plain object is accepted.
    const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    // Deliberately ignored: the config file is optional.
    return {};
  }
}
900
1272
  function loadConfig(overrides) {
1273
+ const file = loadFileConfig();
901
1274
  const raw = {
902
- apiKey: overrides?.apiKey ?? process.env.WINDOWS_USE_API_KEY ?? "",
903
- baseURL: overrides?.baseURL ?? process.env.WINDOWS_USE_BASE_URL ?? "",
904
- model: overrides?.model ?? process.env.WINDOWS_USE_MODEL ?? "",
905
- maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? 50,
906
- contextWindowSize: overrides?.contextWindowSize ?? intEnv("WINDOWS_USE_CONTEXT_WINDOW") ?? 20,
907
- cdpUrl: overrides?.cdpUrl ?? process.env.WINDOWS_USE_CDP_URL ?? "http://localhost:9222",
908
- timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? 3e5
1275
+ apiKey: overrides?.apiKey ?? process.env.WINDOWS_USE_API_KEY ?? file.apiKey ?? "",
1276
+ baseURL: overrides?.baseURL ?? process.env.WINDOWS_USE_BASE_URL ?? file.baseURL ?? "",
1277
+ model: overrides?.model ?? process.env.WINDOWS_USE_MODEL ?? file.model ?? "",
1278
+ maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? file.maxSteps ?? 50,
1279
+ maxRounds: overrides?.maxRounds ?? intEnv("WINDOWS_USE_MAX_ROUNDS") ?? file.maxRounds ?? 20,
1280
+ cdpUrl: overrides?.cdpUrl ?? process.env.WINDOWS_USE_CDP_URL ?? file.cdpUrl ?? "http://localhost:9222",
1281
+ timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? file.timeoutMs ?? 3e5
909
1282
  };
910
1283
  return ConfigSchema.parse(raw);
911
1284
  }
@@ -922,12 +1295,13 @@ function registerMcpTools(server2, registry2) {
922
1295
  "create_session",
923
1296
  "Create a new automation session with a small LLM agent. Returns a session_id.",
924
1297
  {
925
- api_key: z15.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
926
- base_url: z15.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
927
- model: z15.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
928
- cdp_url: z15.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
929
- timeout_ms: z15.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
930
- max_steps: z15.number().optional().describe("Max tool-calling steps per instruction (default: 50)")
1298
+ api_key: z16.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
1299
+ base_url: z16.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
1300
+ model: z16.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
1301
+ cdp_url: z16.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
1302
+ timeout_ms: z16.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
1303
+ max_steps: z16.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
1304
+ max_rounds: z16.number().optional().describe("Max instruction rounds per session (default: 20)")
931
1305
  },
932
1306
  async (args) => {
933
1307
  const config = loadConfig({
@@ -936,7 +1310,8 @@ function registerMcpTools(server2, registry2) {
936
1310
  model: args.model,
937
1311
  cdpUrl: args.cdp_url,
938
1312
  timeoutMs: args.timeout_ms,
939
- maxSteps: args.max_steps
1313
+ maxSteps: args.max_steps,
1314
+ maxRounds: args.max_rounds
940
1315
  });
941
1316
  const session = registry2.create(config);
942
1317
  return {
@@ -951,10 +1326,10 @@ function registerMcpTools(server2, registry2) {
951
1326
  );
952
1327
  server2.tool(
953
1328
  "send_instruction",
954
- "Send a task instruction to the agent in a session. The agent executes it and returns a status report.",
1329
+ "Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
955
1330
  {
956
- session_id: z15.string().describe("Session ID from create_session"),
957
- instruction: z15.string().describe("What you want the agent to do, in natural language")
1331
+ session_id: z16.string().describe("Session ID from create_session"),
1332
+ instruction: z16.string().describe("What you want the agent to do, in natural language")
958
1333
  },
959
1334
  async (args) => {
960
1335
  const session = registry2.get(args.session_id);
@@ -974,32 +1349,37 @@ function registerMcpTools(server2, registry2) {
974
1349
  registry2.touch(args.session_id);
975
1350
  const result = await session.runner.run(args.instruction);
976
1351
  registry2.touch(args.session_id);
977
- const content = [
978
- {
979
- type: "text",
980
- text: JSON.stringify({
981
- status: result.status,
982
- summary: result.summary,
983
- steps_used: result.stepsUsed,
984
- ...result.data !== void 0 ? { data: result.data } : {}
985
- })
1352
+ const mcpContent = [];
1353
+ mcpContent.push({
1354
+ type: "text",
1355
+ text: JSON.stringify({
1356
+ status: result.status,
1357
+ steps_used: result.stepsUsed,
1358
+ round: session.runner.currentRound,
1359
+ rounds_remaining: session.config.maxRounds - session.runner.currentRound,
1360
+ ...result.data !== void 0 ? { data: result.data } : {}
1361
+ })
1362
+ });
1363
+ const blocks = parseReportContent(result.content, session.screenshots);
1364
+ for (const block of blocks) {
1365
+ if (block.type === "text") {
1366
+ mcpContent.push({ type: "text", text: block.text });
1367
+ } else {
1368
+ mcpContent.push({
1369
+ type: "image",
1370
+ data: block.base64,
1371
+ mimeType: block.mimeType
1372
+ });
986
1373
  }
987
- ];
988
- if (result.screenshot) {
989
- content.push({
990
- type: "image",
991
- data: result.screenshot,
992
- mimeType: "image/png"
993
- });
994
1374
  }
995
- return { content };
1375
+ return { content: mcpContent };
996
1376
  }
997
1377
  );
998
1378
  server2.tool(
999
1379
  "done_session",
1000
1380
  "Terminate a session and free all resources.",
1001
1381
  {
1002
- session_id: z15.string().describe("Session ID to terminate")
1382
+ session_id: z16.string().describe("Session ID to terminate")
1003
1383
  },
1004
1384
  async (args) => {
1005
1385
  await registry2.destroy(args.session_id);