windows-use 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,3 +1,8 @@
1
+ // src/config/loader.ts
2
+ import { readFileSync, existsSync } from "fs";
3
+ import { join } from "path";
4
+ import { homedir } from "os";
5
+
1
6
  // src/config/schema.ts
2
7
  import { z } from "zod";
3
8
  var ConfigSchema = z.object({
@@ -5,21 +10,31 @@ var ConfigSchema = z.object({
5
10
  baseURL: z.string().url("Must be a valid URL"),
6
11
  model: z.string().min(1, "Model name is required"),
7
12
  maxSteps: z.number().int().positive().default(50),
8
- contextWindowSize: z.number().int().positive().default(20),
13
+ maxRounds: z.number().int().positive().default(20),
9
14
  cdpUrl: z.string().default("http://localhost:9222"),
10
15
  timeoutMs: z.number().default(3e5)
11
16
  });
12
17
 
13
18
  // src/config/loader.ts
19
/** Absolute path of the optional per-user config file: `.windows-use.json` in the home directory. */
var CONFIG_FILE = join(homedir(), ".windows-use.json");

/**
 * Reads the per-user JSON config file on a best-effort basis.
 *
 * A missing file, an unreadable file, or invalid JSON all yield an empty
 * object. Valid JSON that is not a plain object (`null`, an array, a string,
 * a number...) is also treated as "no config", because callers read
 * properties off the result directly.
 *
 * @returns {object} The parsed config object, or `{}` when nothing usable was found.
 */
function loadFileConfig() {
  if (!existsSync(CONFIG_FILE)) return {};
  try {
    const parsed = JSON.parse(readFileSync(CONFIG_FILE, "utf-8"));
    const isPlainObject = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    // Deliberate best-effort: a broken config file must not prevent startup.
    return {};
  }
}
14
28
/**
 * Assembles the effective configuration and validates it with ConfigSchema.
 *
 * For every field the first non-nullish value wins, in this order:
 * explicit `overrides`, the WINDOWS_USE_* environment variable, the value
 * from the user config file, and finally the hard-coded fallback.
 *
 * @param {object} [overrides] Optional per-call values that take precedence over everything else.
 * @returns The result of `ConfigSchema.parse` on the merged values.
 */
function loadConfig(overrides) {
  const fromFile = loadFileConfig();
  const env = process.env;
  const merged = {
    apiKey: overrides?.apiKey ?? env.WINDOWS_USE_API_KEY ?? fromFile.apiKey ?? "",
    baseURL: overrides?.baseURL ?? env.WINDOWS_USE_BASE_URL ?? fromFile.baseURL ?? "",
    model: overrides?.model ?? env.WINDOWS_USE_MODEL ?? fromFile.model ?? "",
    maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? fromFile.maxSteps ?? 50,
    maxRounds: overrides?.maxRounds ?? intEnv("WINDOWS_USE_MAX_ROUNDS") ?? fromFile.maxRounds ?? 20,
    cdpUrl: overrides?.cdpUrl ?? env.WINDOWS_USE_CDP_URL ?? fromFile.cdpUrl ?? "http://localhost:9222",
    timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? fromFile.timeoutMs ?? 3e5
  };
  return ConfigSchema.parse(merged);
}
@@ -36,23 +51,14 @@ import crypto from "crypto";
36
51
  // src/agent/context-manager.ts
37
52
  var ContextManager = class {
38
53
  messages = [];
39
- maxMessages;
40
- constructor(maxMessages) {
41
- this.maxMessages = maxMessages;
42
- }
43
54
  append(message) {
44
55
  this.messages.push(message);
45
56
  }
46
- /** Returns the system prompt + the most recent messages within the window. */
47
- getWindow() {
48
- if (this.messages.length === 0) return [];
49
- const systemPrompt = this.messages[0]?.role === "system" ? this.messages[0] : null;
50
- const nonSystem = systemPrompt ? this.messages.slice(1) : this.messages;
51
- const windowSize = this.maxMessages - (systemPrompt ? 1 : 0);
52
- const windowed = nonSystem.length > windowSize ? nonSystem.slice(-windowSize) : nonSystem;
53
- return systemPrompt ? [systemPrompt, ...windowed] : windowed;
57
+ /** Returns all messages. */
58
+ getMessages() {
59
+ return [...this.messages];
54
60
  }
55
- /** Total messages stored (before windowing). */
61
+ /** Total messages stored. */
56
62
  get length() {
57
63
  return this.messages.length;
58
64
  }
@@ -106,7 +112,16 @@ Call \`report\` when:
106
112
  - **"blocked"**: You cannot proceed (CAPTCHA, login wall, unexpected error). Explain what's blocking you.
107
113
  - **"need_guidance"**: You need a decision or clarification. Describe what you need.
108
114
 
109
- Calling \`report\` stops your execution. Include a concise summary and optionally a screenshot as evidence.
115
+ Calling \`report\` stops your execution. The \`content\` field supports a rich document format \u2014 mix text with screenshots using \`[Image:img_X]\` markers:
116
+
117
+ \`\`\`
118
+ report({
119
+ status: "completed",
120
+ content: "Here is what I found:\\n[Image:img_2]\\nThe page shows the search results.\\n[Image:img_3]\\nI also checked the sidebar."
121
+ })
122
+ \`\`\`
123
+
124
+ Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
110
125
 
111
126
  ## Important
112
127
  - Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
@@ -122,6 +137,8 @@ var AgentRunner = class {
122
137
  config;
123
138
  toolContext;
124
139
  initialized = false;
140
+ onStep = null;
141
+ roundsUsed = 0;
125
142
  constructor(llmClient, contextManager, toolRegistry, config, toolContext) {
126
143
  this.llmClient = llmClient;
127
144
  this.contextManager = contextManager;
@@ -129,7 +146,30 @@ var AgentRunner = class {
129
146
  this.config = config;
130
147
  this.toolContext = toolContext;
131
148
  }
149
+ /** Register a callback to receive step-by-step progress events */
150
+ setOnStep(cb) {
151
+ this.onStep = cb;
152
+ }
153
+ emit(event) {
154
+ this.onStep?.(event);
155
+ }
156
+ /** How many instruction rounds have been used in this session */
157
+ get currentRound() {
158
+ return this.roundsUsed;
159
+ }
160
+ /** Whether this session has exhausted its max rounds */
161
+ get roundsExhausted() {
162
+ return this.roundsUsed >= this.config.maxRounds;
163
+ }
132
164
  async run(instruction) {
165
+ if (this.roundsExhausted) {
166
+ return {
167
+ status: "blocked",
168
+ content: `Session has reached the maximum number of instruction rounds (${this.config.maxRounds}). Create a new session to continue.`,
169
+ stepsUsed: 0
170
+ };
171
+ }
172
+ this.roundsUsed++;
133
173
  if (!this.initialized) {
134
174
  this.contextManager.append({
135
175
  role: "system",
@@ -145,7 +185,7 @@ var AgentRunner = class {
145
185
  while (stepsUsed < this.config.maxSteps) {
146
186
  stepsUsed++;
147
187
  const remaining = this.config.maxSteps - stepsUsed;
148
- const messages = this.contextManager.getWindow();
188
+ const messages = this.contextManager.getMessages();
149
189
  if (remaining <= 3 && remaining >= 0) {
150
190
  messages.push({
151
191
  role: "system",
@@ -158,9 +198,10 @@ var AgentRunner = class {
158
198
  response = await this.llmClient.chat(messages, tools);
159
199
  } catch (err) {
160
200
  const msg = err instanceof Error ? err.message : String(err);
201
+ this.emit({ type: "error", step: stepsUsed, message: `LLM API error: ${msg}` });
161
202
  return {
162
203
  status: "blocked",
163
- summary: `LLM API error: ${msg}`,
204
+ content: `LLM API error: ${msg}`,
164
205
  stepsUsed
165
206
  };
166
207
  }
@@ -168,26 +209,31 @@ var AgentRunner = class {
168
209
  if (!choice) {
169
210
  return {
170
211
  status: "blocked",
171
- summary: "LLM returned empty response",
212
+ content: "LLM returned empty response",
172
213
  stepsUsed
173
214
  };
174
215
  }
175
216
  const message = choice.message;
217
+ if (message.content) {
218
+ this.emit({ type: "thinking", step: stepsUsed, content: message.content });
219
+ }
176
220
  if (choice.finish_reason === "stop" || !message.tool_calls?.length) {
177
221
  const text = message.content ?? "";
178
222
  this.contextManager.append({ role: "assistant", content: text });
179
223
  return {
180
224
  status: "need_guidance",
181
- summary: text || "Agent stopped without calling report.",
225
+ content: text || "Agent stopped without calling report.",
182
226
  stepsUsed
183
227
  };
184
228
  }
185
229
  this.contextManager.append(message);
186
230
  for (const toolCall of message.tool_calls) {
231
+ const toolName = toolCall.function.name;
187
232
  let args;
188
233
  try {
189
234
  args = JSON.parse(toolCall.function.arguments);
190
235
  } catch {
236
+ this.emit({ type: "error", step: stepsUsed, message: `Failed to parse args for ${toolName}` });
191
237
  this.contextManager.append({
192
238
  role: "tool",
193
239
  tool_call_id: toolCall.id,
@@ -195,15 +241,17 @@ var AgentRunner = class {
195
241
  });
196
242
  continue;
197
243
  }
244
+ this.emit({ type: "tool_call", step: stepsUsed, name: toolName, args });
198
245
  let result;
199
246
  try {
200
247
  result = await this.toolRegistry.execute(
201
- toolCall.function.name,
248
+ toolName,
202
249
  args,
203
250
  this.toolContext
204
251
  );
205
252
  } catch (err) {
206
253
  const msg = err instanceof Error ? err.message : String(err);
254
+ this.emit({ type: "error", step: stepsUsed, message: `${toolName} failed: ${msg}` });
207
255
  this.contextManager.append({
208
256
  role: "tool",
209
257
  tool_call_id: toolCall.id,
@@ -212,6 +260,7 @@ var AgentRunner = class {
212
260
  continue;
213
261
  }
214
262
  if (result.type === "report") {
263
+ this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `[${result.status}] report submitted` });
215
264
  this.contextManager.append({
216
265
  role: "tool",
217
266
  tool_call_id: toolCall.id,
@@ -219,18 +268,18 @@ var AgentRunner = class {
219
268
  });
220
269
  return {
221
270
  status: result.status,
222
- summary: result.summary,
223
- screenshot: result.screenshot,
271
+ content: result.content,
224
272
  data: result.data,
225
273
  stepsUsed
226
274
  };
227
275
  }
228
276
  if (result.type === "image") {
277
+ this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
229
278
  this.contextManager.append({
230
279
  role: "tool",
231
280
  tool_call_id: toolCall.id,
232
281
  content: [
233
- { type: "text", text: "Screenshot captured." },
282
+ { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
234
283
  {
235
284
  type: "image_url",
236
285
  image_url: {
@@ -240,6 +289,8 @@ var AgentRunner = class {
240
289
  ]
241
290
  });
242
291
  } else {
292
+ const preview = result.content.length > 200 ? result.content.slice(0, 200) + "..." : result.content;
293
+ this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: preview });
243
294
  this.contextManager.append({
244
295
  role: "tool",
245
296
  tool_call_id: toolCall.id,
@@ -250,30 +301,208 @@ var AgentRunner = class {
250
301
  }
251
302
  return {
252
303
  status: "blocked",
253
- summary: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
304
+ content: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
254
305
  stepsUsed
255
306
  };
256
307
  }
257
308
  };
258
309
 
259
310
  // src/tools/browser/client.ts
311
+ import { existsSync as existsSync2, mkdirSync, cpSync, readdirSync } from "fs";
312
+ import { spawn, execSync } from "child_process";
313
+ import { join as join2 } from "path";
314
+ import { homedir as homedir2 } from "os";
315
/**
 * Candidate Chrome/Chromium executable locations, in probing order.
 *
 * The per-user Windows install location is only listed when LOCALAPPDATA is
 * set; otherwise the template would produce a path starting at the drive
 * root ("\Google\Chrome\...") that has nothing to do with the user's profile.
 */
var CHROME_PATHS = [
  // Windows
  "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
  "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
  ...(process.env.LOCALAPPDATA ? [`${process.env.LOCALAPPDATA}\\Google\\Chrome\\Application\\chrome.exe`] : []),
  // macOS
  "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  // Linux
  "/usr/bin/google-chrome",
  "/usr/bin/google-chrome-stable",
  "/usr/bin/chromium-browser",
  "/usr/bin/chromium"
];
328
/**
 * Directory names that the Chrome profile sync leaves out when copying.
 * Judging by the names these are caches, metrics and crash data — presumably
 * large and not needed to carry over cookies / login state.
 */
var SKIP_DIRS = /* @__PURE__ */ new Set([
  "Cache", "Code Cache", "GPUCache",
  "Service Worker", "CacheStorage", "File System",
  "blob_storage", "IndexedDB",
  "DawnCache", "GrShaderCache", "ShaderCache",
  "optimization_guide_model_store",
  "BrowserMetrics", "Crashpad", "component_crx_cache"
]);
345
/**
 * Locates a Chrome/Chromium executable on this machine.
 *
 * @returns {string | null} The first non-empty entry of CHROME_PATHS that
 *   exists on disk, or `null` when none of them does.
 */
function findChrome() {
  const installed = CHROME_PATHS.find((candidate) => candidate && existsSync2(candidate));
  return installed ?? null;
}
351
/**
 * Locates the user's existing Chrome/Chromium "user data" directory.
 *
 * Candidates are checked in order: Windows (under LOCALAPPDATA), macOS, then
 * the two Linux locations under ~/.config.
 *
 * @returns {string | null} The first candidate that exists on disk, or `null`.
 */
function findUserDataDir() {
  const localAppData = process.env.LOCALAPPDATA;
  const home = homedir2();
  const candidates = [
    // Windows — skipped when LOCALAPPDATA is unset: joining onto "" would give
    // the *relative* path "Google/Chrome/User Data", which could match an
    // unrelated directory under the current working directory.
    ...(localAppData ? [join2(localAppData, "Google", "Chrome", "User Data")] : []),
    // macOS
    join2(home, "Library", "Application Support", "Google", "Chrome"),
    // Linux
    join2(home, ".config", "google-chrome"),
    join2(home, ".config", "chromium")
  ];
  return candidates.find((dir) => existsSync2(dir)) ?? null;
}
366
/**
 * Extracts the TCP port a CDP endpoint URL refers to.
 *
 * `URL.port` is the empty string when the URL uses its scheme's default port
 * (explicitly or implicitly), so that case is resolved from the scheme; this
 * keeps the launched Chrome on the same port the later connection attempt
 * to `cdpUrl` will use.
 *
 * @param {string} cdpUrl CDP endpoint URL, e.g. "http://localhost:9222".
 * @returns {number} The port; 9222 when the URL cannot be parsed, has a
 *   scheme without a known default port, or specifies port 0.
 */
function getCdpPort(cdpUrl) {
  const FALLBACK_PORT = 9222;
  const SCHEME_DEFAULT_PORTS = { "http:": 80, "ws:": 80, "https:": 443, "wss:": 443 };
  let url;
  try {
    url = new URL(cdpUrl);
  } catch {
    return FALLBACK_PORT;
  }
  if (url.port !== "") {
    return Number.parseInt(url.port, 10) || FALLBACK_PORT;
  }
  return SCHEME_DEFAULT_PORTS[url.protocol] ?? FALLBACK_PORT;
}
373
/**
 * Reports whether a Chrome process appears to be running.
 *
 * On win32 the `tasklist` output is searched for "chrome.exe"; on every other
 * platform `pgrep` is invoked and a successful run counts as "running".
 * `execSync` throws when the command fails, and any such failure is reported
 * as `false`.
 *
 * @returns {boolean}
 */
function isChromeRunning() {
  const onWindows = process.platform === "win32";
  try {
    if (!onWindows) {
      execSync('pgrep -x "chrome|chromium|google-chrome"', { encoding: "utf-8" });
      return true;
    }
    const listing = execSync('tasklist /FI "IMAGENAME eq chrome.exe" /NH', {
      encoding: "utf-8",
      windowsHide: true
    });
    return listing.includes("chrome.exe");
  } catch {
    return false;
  }
}
389
/**
 * Copies a Chrome user-data directory into `targetDir` on a best-effort basis.
 *
 * Top-level files are copied as-is. "Default" and "Profile N" directories are
 * handed to syncProfileDir; any other directory is copied recursively unless
 * its name is listed in SKIP_DIRS. Individual copy failures are ignored so a
 * single unreadable entry does not abort the whole sync.
 *
 * @param {string} sourceDir Existing Chrome user-data directory.
 * @param {string} targetDir Destination directory; created if missing.
 */
function syncProfile(sourceDir, targetDir) {
  mkdirSync(targetDir, { recursive: true });
  let entries;
  try {
    entries = readdirSync(sourceDir, { withFileTypes: true });
  } catch {
    // Same policy as syncProfileDir: an unreadable source means there is
    // nothing to copy, not that the caller should fail.
    return;
  }
  for (const entry of entries) {
    const src = join2(sourceDir, entry.name);
    const dst = join2(targetDir, entry.name);
    if (entry.isFile()) {
      try {
        cpSync(src, dst, { force: true });
      } catch {
        // Best-effort: skip files that cannot be copied.
      }
      continue;
    }
    if (!entry.isDirectory()) continue;
    if (entry.name === "Default" || entry.name.startsWith("Profile ")) {
      syncProfileDir(src, dst);
      continue;
    }
    if (SKIP_DIRS.has(entry.name)) continue;
    try {
      cpSync(src, dst, { recursive: true, force: true });
    } catch {
      // Best-effort: skip directories that cannot be copied.
    }
  }
}
412
/**
 * Copies the contents of one Chrome profile directory into `targetDir`,
 * leaving out every entry whose name is in SKIP_DIRS.
 *
 * The target is created first. If the source cannot be listed the function
 * returns without copying anything; failures on individual entries are
 * ignored.
 *
 * @param {string} sourceDir Profile directory to read from.
 * @param {string} targetDir Destination directory; created if missing.
 */
function syncProfileDir(sourceDir, targetDir) {
  mkdirSync(targetDir, { recursive: true });
  let listing;
  try {
    listing = readdirSync(sourceDir, { withFileTypes: true });
  } catch {
    return;
  }
  listing
    .filter((item) => !SKIP_DIRS.has(item.name))
    .forEach((item) => {
      const from = join2(sourceDir, item.name);
      const to = join2(targetDir, item.name);
      try {
        if (item.isFile()) {
          cpSync(from, to, { force: true });
        } else if (item.isDirectory()) {
          cpSync(from, to, { recursive: true, force: true });
        }
      } catch {
        // Ignored on purpose: one failing entry must not stop the rest.
      }
    });
}
260
434
  var BrowserClient = class {
261
435
  browser = null;
262
436
  context = null;
263
437
  _page = null;
264
438
  cdpUrl;
439
+ chromeProcess = null;
265
440
  constructor(cdpUrl) {
266
441
  this.cdpUrl = cdpUrl;
267
442
  }
268
443
  async connect() {
269
444
  if (this.browser) return;
270
445
  const { chromium } = await import("playwright");
271
- this.browser = await chromium.connectOverCDP(this.cdpUrl);
446
+ try {
447
+ this.browser = await chromium.connectOverCDP(this.cdpUrl);
448
+ } catch {
449
+ await this.launchChrome();
450
+ this.browser = await chromium.connectOverCDP(this.cdpUrl);
451
+ }
272
452
  const contexts = this.browser.contexts();
273
453
  this.context = contexts[0] ?? await this.browser.newContext();
274
454
  const pages = this.context.pages();
275
455
  this._page = pages[0] ?? await this.context.newPage();
276
456
  }
457
+ async launchChrome() {
458
+ const chromePath = findChrome();
459
+ if (!chromePath) {
460
+ throw new Error(
461
+ "Chrome not found. Please install Chrome or start it manually with: chrome --remote-debugging-port=9222"
462
+ );
463
+ }
464
+ const port = getCdpPort(this.cdpUrl);
465
+ if (isChromeRunning()) {
466
+ console.error("[windows-use] Chrome is running without CDP. Restarting with --remote-debugging-port...");
467
+ try {
468
+ if (process.platform === "win32") {
469
+ execSync("taskkill /F /IM chrome.exe /T", { windowsHide: true, stdio: "ignore" });
470
+ } else {
471
+ execSync("pkill -f chrome", { stdio: "ignore" });
472
+ }
473
+ } catch {
474
+ }
475
+ await new Promise((r) => setTimeout(r, 1500));
476
+ }
477
+ const targetDir = join2(homedir2(), ".windows-use", "chrome-profile");
478
+ const userDir = findUserDataDir();
479
+ if (userDir) {
480
+ console.error("[windows-use] Syncing Chrome profile (cookies, login state)...");
481
+ syncProfile(userDir, targetDir);
482
+ console.error("[windows-use] Profile synced.");
483
+ } else {
484
+ mkdirSync(targetDir, { recursive: true });
485
+ }
486
+ console.error(`[windows-use] Launching Chrome with --remote-debugging-port=${port}`);
487
+ this.chromeProcess = spawn(
488
+ chromePath,
489
+ [
490
+ `--remote-debugging-port=${port}`,
491
+ `--user-data-dir=${targetDir}`
492
+ ],
493
+ { detached: true, stdio: "ignore" }
494
+ );
495
+ this.chromeProcess.unref();
496
+ for (let i = 0; i < 30; i++) {
497
+ try {
498
+ const res = await fetch(`http://localhost:${port}/json/version`);
499
+ if (res.ok) return;
500
+ } catch {
501
+ }
502
+ await new Promise((r) => setTimeout(r, 500));
503
+ }
504
+ throw new Error("Chrome launched but CDP endpoint did not become available within 15s");
505
+ }
277
506
  async getPage() {
278
507
  await this.connect();
279
508
  return this._page;
@@ -408,11 +637,69 @@ var ToolRegistry = class {
408
637
 
409
638
  // src/tools/windows/screenshot.ts
410
639
  import { z as z2 } from "zod";
640
+ import sharp2 from "sharp";
641
+
642
+ // src/tools/windows/grid-overlay.ts
643
+ import sharp from "sharp";
644
/**
 * Draws a labelled coordinate grid over an image and returns it as a JPEG.
 *
 * Grid lines are drawn every `gridSpacing` pixels (every fifth line is drawn
 * stronger), numeric labels every `labelSpacing` pixels along the top and
 * left edges, a "0,0" badge in the top-left corner and a "WxH" badge in the
 * bottom-right corner. The overlay is built as an SVG and composited with
 * sharp at quality 70.
 *
 * @param {Buffer} imageBuffer Source image, passed to sharp unchanged.
 * @param {number} width Overlay width in pixels.
 * @param {number} height Overlay height in pixels.
 * @param {{ gridSpacing?: number, labelSpacing?: number }} [options]
 *   Spacing overrides; defaults are 100 and 200.
 * @returns {Promise<Buffer>} JPEG-encoded image with the grid applied.
 * @throws {RangeError} When a spacing is not a positive finite number
 *   (a zero or negative step would otherwise never terminate).
 */
async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
  const gridSpacing = options.gridSpacing ?? 100;
  const labelSpacing = options.labelSpacing ?? 200;
  for (const [name, value] of [["gridSpacing", gridSpacing], ["labelSpacing", labelSpacing]]) {
    if (!Number.isFinite(value) || value <= 0) {
      throw new RangeError(`${name} must be a positive finite number, got ${value}`);
    }
  }
  const majorSpacing = gridSpacing * 5;

  // Stroke attributes for a grid line at `pos`; major lines are more visible.
  const lineStroke = (pos) => {
    const isMajor = pos % majorSpacing === 0;
    return `stroke="rgba(255,50,50,${isMajor ? 0.35 : 0.15})" stroke-width="${isMajor ? 1.5 : 0.5}"`;
  };
  // Dark rounded background behind a label.
  const labelBox = (x, y, boxWidth) =>
    `<rect x="${x}" y="${y}" width="${boxWidth}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`;
  // Label text, optionally centred on `x`.
  const labelText = (x, y, text, centered = false) =>
    `<text x="${x}" y="${y}"${centered ? ' text-anchor="middle"' : ""} fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`;
  // Rough label width: 7.5px per character plus padding.
  const textWidth = (text) => text.length * 7.5 + 6;

  const svgParts = [];
  for (let x = gridSpacing; x < width; x += gridSpacing) {
    svgParts.push(`<line x1="${x}" y1="0" x2="${x}" y2="${height}" ${lineStroke(x)}/>`);
  }
  for (let y = gridSpacing; y < height; y += gridSpacing) {
    svgParts.push(`<line x1="0" y1="${y}" x2="${width}" y2="${y}" ${lineStroke(y)}/>`);
  }
  for (let x = labelSpacing; x < width; x += labelSpacing) {
    const text = String(x);
    const tw = textWidth(text);
    svgParts.push(labelBox(x - tw / 2, 2, tw), labelText(x, 14, text, true));
  }
  for (let y = labelSpacing; y < height; y += labelSpacing) {
    const text = String(y);
    svgParts.push(labelBox(2, y - 8, textWidth(text)), labelText(5, y + 4, text));
  }
  // Origin badge (top-left) and image-size badge (bottom-right).
  svgParts.push(labelBox(2, 2, 22), labelText(5, 14, "0,0"));
  const dimText = `${width}x${height}`;
  const dimTw = textWidth(dimText);
  svgParts.push(
    labelBox(width - dimTw - 2, height - 18, dimTw),
    labelText(width - dimTw / 2 - 2, height - 6, dimText, true)
  );

  const svg = Buffer.from(
    `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
  );
  return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
}
696
+
697
+ // src/tools/windows/screenshot.ts
411
698
  var screenshotTool = {
412
699
  name: "screenshot",
413
- description: "Capture the full screen and return it as an image. Use this to see what is currently displayed.",
700
+ description: "Capture the full screen with a coordinate grid overlay. The grid shows pixel coordinates that match mouse_click/mouse_move coordinates. Returns a screenshot ID.",
414
701
  parameters: z2.object({}),
415
- async execute() {
702
+ async execute(_args, ctx) {
416
703
  const { Monitor } = await import("node-screenshots");
417
704
  const monitors = Monitor.all();
418
705
  const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
@@ -420,11 +707,24 @@ var screenshotTool = {
420
707
  return { type: "text", content: "Error: No monitor found" };
421
708
  }
422
709
  const image = primary.captureImageSync();
423
- const buf = image.toPngSync();
710
+ const physW = image.width;
711
+ const physH = image.height;
712
+ const scaleFactor = primary.scaleFactor ?? 1;
713
+ const logicalW = Math.round(physW / scaleFactor);
714
+ const logicalH = Math.round(physH / scaleFactor);
715
+ const raw = image.toRawSync();
716
+ const resized = await sharp2(raw, {
717
+ raw: { width: physW, height: physH, channels: 4 }
718
+ }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
719
+ const cleanBase64 = resized.toString("base64");
720
+ const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
721
+ const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
722
+ const gridBase64 = gridImage.toString("base64");
424
723
  return {
425
724
  type: "image",
426
- base64: buf.toString("base64"),
427
- mimeType: "image/png"
725
+ base64: gridBase64,
726
+ mimeType: "image/jpeg",
727
+ screenshotId: id
428
728
  };
429
729
  }
430
730
  };
@@ -659,13 +959,46 @@ var fileWriteTool = {
659
959
  }
660
960
  };
661
961
 
662
- // src/tools/browser/navigate.ts
962
+ // src/tools/file/image.ts
663
963
  import { z as z8 } from "zod";
964
+ import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
965
+ import { extname } from "path";
966
/** MIME type for each supported (lower-cased) image file extension. */
var IMAGE_MIME_TYPES = /* @__PURE__ */ new Map([
  [".png", "image/png"],
  [".jpg", "image/jpeg"],
  [".jpeg", "image/jpeg"],
  [".bmp", "image/bmp"],
  [".webp", "image/webp"]
]);
/** Supported image file extensions (lower-case, including the dot). */
var IMAGE_EXTS = /* @__PURE__ */ new Set(IMAGE_MIME_TYPES.keys());
/**
 * Tool that loads an image file from disk, registers it in the session's
 * screenshot store and returns it as an image result with its screenshot ID.
 *
 * Returns a text result starting with "Error:" when the file does not exist
 * or its extension is not supported.
 */
var useLocalImageTool = {
  name: "use_local_image",
  description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
  parameters: z8.object({
    path: z8.string().describe("Absolute path to the image file"),
    label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
  }),
  async execute(args, ctx) {
    if (!existsSync3(args.path)) {
      return { type: "text", content: `Error: File not found: ${args.path}` };
    }
    const ext = extname(args.path).toLowerCase();
    // Look the type up per extension so .bmp and .webp are not labelled as JPEG.
    const mimeType = IMAGE_MIME_TYPES.get(ext);
    if (mimeType === undefined) {
      return { type: "text", content: `Error: Not a supported image format (${ext}). Supported: ${[...IMAGE_EXTS].join(", ")}` };
    }
    const base64 = readFileSync2(args.path).toString("base64");
    const id = ctx.screenshots.save(base64, mimeType, args.label);
    return {
      type: "image",
      base64,
      mimeType,
      screenshotId: id
    };
  }
};
994
+
995
+ // src/tools/browser/navigate.ts
996
+ import { z as z9 } from "zod";
664
997
  var browserNavigateTool = {
665
998
  name: "browser_navigate",
666
999
  description: "Navigate the browser to a URL.",
667
- parameters: z8.object({
668
- url: z8.string().describe("The URL to navigate to")
1000
+ parameters: z9.object({
1001
+ url: z9.string().describe("The URL to navigate to")
669
1002
  }),
670
1003
  async execute(args, ctx) {
671
1004
  const browser = await ctx.getBrowser();
@@ -678,12 +1011,12 @@ Page title: ${title}` };
678
1011
  };
679
1012
 
680
1013
  // src/tools/browser/click.ts
681
- import { z as z9 } from "zod";
1014
+ import { z as z10 } from "zod";
682
1015
  var browserClickTool = {
683
1016
  name: "browser_click",
684
1017
  description: "Click an element on the web page using a CSS selector or text content.",
685
- parameters: z9.object({
686
- selector: z9.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
1018
+ parameters: z10.object({
1019
+ selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
687
1020
  }),
688
1021
  async execute(args, ctx) {
689
1022
  const browser = await ctx.getBrowser();
@@ -694,14 +1027,14 @@ var browserClickTool = {
694
1027
  };
695
1028
 
696
1029
  // src/tools/browser/type.ts
697
- import { z as z10 } from "zod";
1030
+ import { z as z11 } from "zod";
698
1031
  var browserTypeTool = {
699
1032
  name: "browser_type",
700
1033
  description: "Type text into an input field on the web page.",
701
- parameters: z10.object({
702
- selector: z10.string().describe("CSS selector for the input element"),
703
- text: z10.string().describe("Text to type"),
704
- clear: z10.boolean().default(true).describe("Whether to clear the field before typing")
1034
+ parameters: z11.object({
1035
+ selector: z11.string().describe("CSS selector for the input element"),
1036
+ text: z11.string().describe("Text to type"),
1037
+ clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
705
1038
  }),
706
1039
  async execute(args, ctx) {
707
1040
  const browser = await ctx.getBrowser();
@@ -716,35 +1049,40 @@ var browserTypeTool = {
716
1049
  };
717
1050
 
718
1051
  // src/tools/browser/screenshot.ts
719
- import { z as z11 } from "zod";
1052
+ import { z as z12 } from "zod";
720
1053
/**
 * Tool that captures the current browser page as a JPEG (quality 70, CSS
 * pixel scale), stores it in the session's screenshot store and returns the
 * image together with the ID assigned by the store.
 */
var browserScreenshotTool = {
  name: "browser_screenshot",
  description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
  parameters: z12.object({
    fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
  }),
  async execute(args, ctx) {
    const client = await ctx.getBrowser();
    const activePage = await client.getPage();
    const { fullPage } = args;
    const captured = await activePage.screenshot({ type: "jpeg", quality: 70, fullPage, scale: "css" });
    const base64 = captured.toString("base64");
    const screenshotId = ctx.screenshots.save(base64, "image/jpeg", "browser");
    return { type: "image", base64, mimeType: "image/jpeg", screenshotId };
  }
};
740
1078
 
741
1079
  // src/tools/browser/content.ts
742
- import { z as z12 } from "zod";
1080
+ import { z as z13 } from "zod";
743
1081
  var MAX_CONTENT_LENGTH = 2e4;
744
1082
  var browserContentTool = {
745
1083
  name: "browser_content",
746
1084
  description: "Get the text content of the current web page. Returns visible text, not HTML.",
747
- parameters: z12.object({}),
1085
+ parameters: z13.object({}),
748
1086
  async execute(_args, ctx) {
749
1087
  const browser = await ctx.getBrowser();
750
1088
  const page = await browser.getPage();
@@ -765,13 +1103,13 @@ ${text}`
765
1103
  };
766
1104
 
767
1105
  // src/tools/browser/scroll.ts
768
- import { z as z13 } from "zod";
1106
+ import { z as z14 } from "zod";
769
1107
  var browserScrollTool = {
770
1108
  name: "browser_scroll",
771
1109
  description: "Scroll the current web page.",
772
- parameters: z13.object({
773
- direction: z13.enum(["up", "down"]).describe("Scroll direction"),
774
- amount: z13.number().positive().default(500).describe("Pixels to scroll")
1110
+ parameters: z14.object({
1111
+ direction: z14.enum(["up", "down"]).describe("Scroll direction"),
1112
+ amount: z14.number().positive().default(500).describe("Pixels to scroll")
775
1113
  }),
776
1114
  async execute(args, ctx) {
777
1115
  const browser = await ctx.getBrowser();
@@ -783,38 +1121,22 @@ var browserScrollTool = {
783
1121
  };
784
1122
 
785
1123
  // src/tools/control/report.ts
786
- import { z as z14 } from "zod";
1124
+ import { z as z15 } from "zod";
787
1125
  var reportTool = {
788
1126
  name: "report",
789
- description: "Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.",
790
- parameters: z14.object({
791
- status: z14.enum(["completed", "blocked", "need_guidance"]).describe(
1127
+ description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
1128
+ parameters: z15.object({
1129
+ status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
792
1130
  '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
793
1131
  ),
794
- summary: z14.string().describe("Concise human-readable summary of what was accomplished or what the problem is"),
795
- include_screenshot: z14.boolean().default(false).describe("Whether to capture and include a screenshot of the current state"),
796
- data: z14.unknown().optional().describe("Optional structured data to return")
1132
+ content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
1133
+ data: z15.unknown().optional().describe("Optional structured data to return")
797
1134
  }),
798
1135
  async execute(args) {
799
- let screenshot;
800
- if (args.include_screenshot) {
801
- try {
802
- const { Monitor } = await import("node-screenshots");
803
- const monitors = Monitor.all();
804
- const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
805
- if (primary) {
806
- const image = primary.captureImageSync();
807
- const buf = image.toPngSync();
808
- screenshot = buf.toString("base64");
809
- }
810
- } catch {
811
- }
812
- }
813
1136
  return {
814
1137
  type: "report",
815
1138
  status: args.status,
816
- summary: args.summary,
817
- screenshot,
1139
+ content: args.content,
818
1140
  data: args.data
819
1141
  };
820
1142
  }
@@ -832,6 +1154,7 @@ function createToolRegistry() {
832
1154
  registry.register(runCommandTool);
833
1155
  registry.register(fileReadTool);
834
1156
  registry.register(fileWriteTool);
1157
+ registry.register(useLocalImageTool);
835
1158
  registry.register(browserNavigateTool);
836
1159
  registry.register(browserClickTool);
837
1160
  registry.register(browserTypeTool);
@@ -842,21 +1165,73 @@ function createToolRegistry() {
842
1165
  return registry;
843
1166
  }
844
1167
 
1168
+ // src/tools/types.ts
1169
/**
 * In-memory registry of captured images, keyed by sequential IDs
 * ("img_1", "img_2", ...) handed out in the order images are saved.
 */
var ScreenshotStore = class {
  counter = 0;
  store = /* @__PURE__ */ new Map();
  /**
   * Stores an image and returns its newly assigned ID.
   * @param {string} base64 Base64-encoded image data.
   * @param {string} mimeType MIME type recorded with the image.
   * @param {string} label Label recorded with the image.
   * @returns {string} The new ID, "img_" followed by the running counter.
   */
  save(base64, mimeType, label) {
    this.counter += 1;
    const id = "img_" + this.counter;
    const record = { id, base64, mimeType, label };
    this.store.set(id, record);
    return id;
  }
  /** Returns the stored record for `id`, or `undefined` when the ID is unknown. */
  get(id) {
    return this.store.get(id);
  }
  /** Returns all assigned IDs in insertion order, as a new array. */
  listIds() {
    return Array.from(this.store.keys());
  }
};
1185
/**
 * Splits report text into an ordered list of text and image blocks.
 *
 * Every `[Image:img_N]` marker whose ID is known to `store` becomes an image
 * block carrying the stored data; a marker with an unknown ID is kept
 * verbatim as its own text block. Text between markers becomes text blocks;
 * empty stretches produce no block.
 *
 * @param {string} content Report text, possibly containing image markers.
 * @param {{ get(id: string): object | undefined }} store Lookup for screenshot records.
 * @returns {object[]} Blocks of shape `{ type: "text", text }` or
 *   `{ type: "image", id, base64, mimeType, label }`.
 */
function parseReportContent(content, store) {
  const marker = /\[Image:(img_\d+)\]/g;
  const blocks = [];
  let cursor = 0;
  for (let hit = marker.exec(content); hit !== null; hit = marker.exec(content)) {
    const [rawMarker, imageId] = hit;
    if (hit.index > cursor) {
      blocks.push({ type: "text", text: content.slice(cursor, hit.index) });
    }
    const stored = store.get(imageId);
    if (!stored) {
      // Unknown ID: keep the marker visible instead of dropping it.
      blocks.push({ type: "text", text: rawMarker });
    } else {
      const { id, base64, mimeType, label } = stored;
      blocks.push({ type: "image", id, base64, mimeType, label });
    }
    cursor = marker.lastIndex;
  }
  if (cursor < content.length) {
    blocks.push({ type: "text", text: content.slice(cursor) });
  }
  return blocks;
}
1214
/**
 * Produces a text-only version of report content: removes every
 * `[Image:img_N]` marker, collapses runs of three or more newlines into a
 * single blank line, and trims surrounding whitespace.
 *
 * @param {string} content Report text, possibly containing image markers.
 * @returns {string} The cleaned text.
 */
function stripImageMarkers(content) {
  const withoutMarkers = content.replace(/\[Image:img_\d+\]/g, "");
  const collapsed = withoutMarkers.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
1217
+
845
1218
  // src/mcp/session-registry.ts
846
1219
  var SessionRegistry = class {
847
1220
  sessions = /* @__PURE__ */ new Map();
848
1221
  create(config) {
849
1222
  const id = crypto.randomUUID();
850
- const contextManager = new ContextManager(config.contextWindowSize);
1223
+ const contextManager = new ContextManager();
851
1224
  const llmClient = new LLMClient(config);
852
1225
  const browserClient = new BrowserClient(config.cdpUrl);
853
1226
  const toolRegistry = createToolRegistry();
1227
+ const screenshotStore = new ScreenshotStore();
854
1228
  const toolContext = {
855
1229
  sessionId: id,
856
1230
  cdpUrl: config.cdpUrl,
857
1231
  getBrowser: () => {
858
1232
  return browserClient.connect().then(() => browserClient);
859
- }
1233
+ },
1234
+ screenshots: screenshotStore
860
1235
  };
861
1236
  const runner = new AgentRunner(
862
1237
  llmClient,
@@ -876,6 +1251,7 @@ var SessionRegistry = class {
876
1251
  config,
877
1252
  runner,
878
1253
  browserClient,
1254
+ screenshots: screenshotStore,
879
1255
  timeoutHandle
880
1256
  };
881
1257
  this.sessions.set(id, session);
@@ -913,9 +1289,12 @@ export {
913
1289
  BrowserClient,
914
1290
  ContextManager,
915
1291
  LLMClient,
1292
+ ScreenshotStore,
916
1293
  SessionRegistry,
917
1294
  ToolRegistry,
918
1295
  createToolRegistry,
919
- loadConfig
1296
+ loadConfig,
1297
+ parseReportContent,
1298
+ stripImageMarkers
920
1299
  };
921
1300
  //# sourceMappingURL=index.js.map