npm - windows-use - Version diff - 0.1.0 → 0.2.0 - Mend

windows-use 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/cli.js (changed)

@@ -15,7 +15,7 @@ var init_schema = __esm({
       baseURL: z.string().url("Must be a valid URL"),
       model: z.string().min(1, "Model name is required"),
       maxSteps: z.number().int().positive().default(50),
-      contextWindowSize: z.number().int().positive().default(20),
+      maxRounds: z.number().int().positive().default(20),
       cdpUrl: z.string().default("http://localhost:9222"),
       timeoutMs: z.number().default(3e5)
     });
@@ -23,15 +23,30 @@ var init_schema = __esm({
 });
 // src/config/loader.ts
+import { readFileSync, existsSync } from "fs";
+import { join } from "path";
+import { homedir } from "os";
+function loadFileConfig() {
+  if (!existsSync(CONFIG_FILE)) return {};
+  try {
+    return JSON.parse(readFileSync(CONFIG_FILE, "utf-8"));
+  } catch {
+    return {};
+  }
+}
+function getConfigPath() {
+  return CONFIG_FILE;
+}
 function loadConfig(overrides) {
+  const file = loadFileConfig();
   const raw = {
-    apiKey: overrides?.apiKey ?? process.env.WINDOWS_USE_API_KEY ?? "",
-    baseURL: overrides?.baseURL ?? process.env.WINDOWS_USE_BASE_URL ?? "",
-    model: overrides?.model ?? process.env.WINDOWS_USE_MODEL ?? "",
-    maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? 50,
-    contextWindowSize: overrides?.contextWindowSize ?? intEnv("WINDOWS_USE_CONTEXT_WINDOW") ?? 20,
-    cdpUrl: overrides?.cdpUrl ?? process.env.WINDOWS_USE_CDP_URL ?? "http://localhost:9222",
-    timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? 3e5
+    apiKey: overrides?.apiKey ?? process.env.WINDOWS_USE_API_KEY ?? file.apiKey ?? "",
+    baseURL: overrides?.baseURL ?? process.env.WINDOWS_USE_BASE_URL ?? file.baseURL ?? "",
+    model: overrides?.model ?? process.env.WINDOWS_USE_MODEL ?? file.model ?? "",
+    maxSteps: overrides?.maxSteps ?? intEnv("WINDOWS_USE_MAX_STEPS") ?? file.maxSteps ?? 50,
+    maxRounds: overrides?.maxRounds ?? intEnv("WINDOWS_USE_MAX_ROUNDS") ?? file.maxRounds ?? 20,
+    cdpUrl: overrides?.cdpUrl ?? process.env.WINDOWS_USE_CDP_URL ?? file.cdpUrl ?? "http://localhost:9222",
+    timeoutMs: overrides?.timeoutMs ?? intEnv("WINDOWS_USE_TIMEOUT_MS") ?? file.timeoutMs ?? 3e5
   };
   return ConfigSchema.parse(raw);
 }
@@ -41,10 +56,12 @@ function intEnv(name) {
   const n = parseInt(val, 10);
   return isNaN(n) ? void 0 : n;
 }
+var CONFIG_FILE;
 var init_loader = __esm({
   "src/config/loader.ts"() {
     "use strict";
     init_schema();
+    CONFIG_FILE = join(homedir(), ".windows-use.json");
   }
 });
@@ -55,23 +72,14 @@ var init_context_manager = __esm({
     "use strict";
     ContextManager = class {
       messages = [];
-      maxMessages;
-      constructor(maxMessages) {
-        this.maxMessages = maxMessages;
-      }
       append(message) {
         this.messages.push(message);
       }
-      /** Returns the system prompt + the most recent messages within the window. */
-      getWindow() {
-        if (this.messages.length === 0) return [];
-        const systemPrompt = this.messages[0]?.role === "system" ? this.messages[0] : null;
-        const nonSystem = systemPrompt ? this.messages.slice(1) : this.messages;
-        const windowSize = this.maxMessages - (systemPrompt ? 1 : 0);
-        const windowed = nonSystem.length > windowSize ? nonSystem.slice(-windowSize) : nonSystem;
-        return systemPrompt ? [systemPrompt, ...windowed] : windowed;
+      /** Returns all messages. */
+      getMessages() {
+        return [...this.messages];
       }
-      /** Total messages stored (before windowing). */
+      /** Total messages stored. */
       get length() {
         return this.messages.length;
       }
@@ -133,7 +141,16 @@ Call \`report\` when:
 - **"blocked"**: You cannot proceed (CAPTCHA, login wall, unexpected error). Explain what's blocking you.
 - **"need_guidance"**: You need a decision or clarification. Describe what you need.
-Calling \`report\` stops your execution. Include a concise summary and optionally a screenshot as evidence.
+Calling \`report\` stops your execution. The \`content\` field supports a rich document format \u2014 mix text with screenshots using \`[Image:img_X]\` markers:
+\`\`\`
+report({
+  status: "completed",
+  content: "Here is what I found:\\n[Image:img_2]\\nThe page shows the search results.\\n[Image:img_3]\\nI also checked the sidebar."
+})
+\`\`\`
+Each screenshot tool returns a screenshot ID (e.g. img_1, img_2). Use these IDs to embed images in your report.
 ## Important
 - Do NOT keep retrying the same failing action. If something fails twice, call \`report\` with status "blocked".
@@ -159,6 +176,8 @@ var init_runner = __esm({
       config;
       toolContext;
       initialized = false;
+      onStep = null;
+      roundsUsed = 0;
       constructor(llmClient, contextManager, toolRegistry, config, toolContext) {
         this.llmClient = llmClient;
         this.contextManager = contextManager;
@@ -166,7 +185,30 @@ var init_runner = __esm({
         this.config = config;
         this.toolContext = toolContext;
       }
+      /** Register a callback to receive step-by-step progress events */
+      setOnStep(cb) {
+        this.onStep = cb;
+      }
+      emit(event) {
+        this.onStep?.(event);
+      }
+      /** How many instruction rounds have been used in this session */
+      get currentRound() {
+        return this.roundsUsed;
+      }
+      /** Whether this session has exhausted its max rounds */
+      get roundsExhausted() {
+        return this.roundsUsed >= this.config.maxRounds;
+      }
       async run(instruction) {
+        if (this.roundsExhausted) {
+          return {
+            status: "blocked",
+            content: `Session has reached the maximum number of instruction rounds (${this.config.maxRounds}). Create a new session to continue.`,
+            stepsUsed: 0
+          };
+        }
+        this.roundsUsed++;
         if (!this.initialized) {
           this.contextManager.append({
             role: "system",
@@ -182,7 +224,7 @@ var init_runner = __esm({
         while (stepsUsed < this.config.maxSteps) {
           stepsUsed++;
           const remaining = this.config.maxSteps - stepsUsed;
-          const messages = this.contextManager.getWindow();
+          const messages = this.contextManager.getMessages();
           if (remaining <= 3 && remaining >= 0) {
             messages.push({
               role: "system",
@@ -195,9 +237,10 @@ var init_runner = __esm({
             response = await this.llmClient.chat(messages, tools);
           } catch (err) {
             const msg = err instanceof Error ? err.message : String(err);
+            this.emit({ type: "error", step: stepsUsed, message: `LLM API error: ${msg}` });
             return {
               status: "blocked",
-              summary: `LLM API error: ${msg}`,
+              content: `LLM API error: ${msg}`,
               stepsUsed
             };
           }
@@ -205,26 +248,31 @@ var init_runner = __esm({
           if (!choice) {
             return {
               status: "blocked",
-              summary: "LLM returned empty response",
+              content: "LLM returned empty response",
               stepsUsed
             };
           }
           const message = choice.message;
+          if (message.content) {
+            this.emit({ type: "thinking", step: stepsUsed, content: message.content });
+          }
           if (choice.finish_reason === "stop" || !message.tool_calls?.length) {
             const text = message.content ?? "";
             this.contextManager.append({ role: "assistant", content: text });
             return {
               status: "need_guidance",
-              summary: text || "Agent stopped without calling report.",
+              content: text || "Agent stopped without calling report.",
               stepsUsed
             };
           }
           this.contextManager.append(message);
           for (const toolCall of message.tool_calls) {
+            const toolName = toolCall.function.name;
             let args;
             try {
               args = JSON.parse(toolCall.function.arguments);
             } catch {
+              this.emit({ type: "error", step: stepsUsed, message: `Failed to parse args for ${toolName}` });
               this.contextManager.append({
                 role: "tool",
                 tool_call_id: toolCall.id,
@@ -232,15 +280,17 @@ var init_runner = __esm({
               });
               continue;
             }
+            this.emit({ type: "tool_call", step: stepsUsed, name: toolName, args });
             let result;
             try {
               result = await this.toolRegistry.execute(
-                toolCall.function.name,
+                toolName,
                 args,
                 this.toolContext
               );
             } catch (err) {
               const msg = err instanceof Error ? err.message : String(err);
+              this.emit({ type: "error", step: stepsUsed, message: `${toolName} failed: ${msg}` });
               this.contextManager.append({
                 role: "tool",
                 tool_call_id: toolCall.id,
@@ -249,6 +299,7 @@ var init_runner = __esm({
               continue;
             }
             if (result.type === "report") {
+              this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `[${result.status}] report submitted` });
               this.contextManager.append({
                 role: "tool",
                 tool_call_id: toolCall.id,
@@ -256,18 +307,18 @@ var init_runner = __esm({
               });
               return {
                 status: result.status,
-                summary: result.summary,
-                screenshot: result.screenshot,
+                content: result.content,
                 data: result.data,
                 stepsUsed
               };
             }
             if (result.type === "image") {
+              this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: `Screenshot captured (${result.screenshotId})` });
               this.contextManager.append({
                 role: "tool",
                 tool_call_id: toolCall.id,
                 content: [
-                  { type: "text", text: "Screenshot captured." },
+                  { type: "text", text: `Screenshot captured. ID: ${result.screenshotId}` },
                   {
                     type: "image_url",
                     image_url: {
@@ -277,6 +328,8 @@ var init_runner = __esm({
                 ]
               });
             } else {
+              const preview = result.content.length > 200 ? result.content.slice(0, 200) + "..." : result.content;
+              this.emit({ type: "tool_result", step: stepsUsed, name: toolName, result: preview });
               this.contextManager.append({
                 role: "tool",
                 tool_call_id: toolCall.id,
@@ -287,7 +340,7 @@ var init_runner = __esm({
         }
         return {
           status: "blocked",
-          summary: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
+          content: `Exceeded maximum steps limit (${this.config.maxSteps}). Task may be incomplete.`,
           stepsUsed
         };
       }
@@ -296,27 +349,205 @@ var init_runner = __esm({
 });
 // src/tools/browser/client.ts
-var BrowserClient;
+import { existsSync as existsSync2, mkdirSync, cpSync, readdirSync } from "fs";
+import { spawn, execSync } from "child_process";
+import { join as join2 } from "path";
+import { homedir as homedir2 } from "os";
+function findChrome() {
+  for (const p of CHROME_PATHS) {
+    if (p && existsSync2(p)) return p;
+  }
+  return null;
+}
+function findUserDataDir() {
+  const candidates = [
+    // Windows
+    join2(process.env.LOCALAPPDATA ?? "", "Google", "Chrome", "User Data"),
+    // macOS
+    join2(homedir2(), "Library", "Application Support", "Google", "Chrome"),
+    // Linux
+    join2(homedir2(), ".config", "google-chrome"),
+    join2(homedir2(), ".config", "chromium")
+  ];
+  for (const p of candidates) {
+    if (p && existsSync2(p)) return p;
+  }
+  return null;
+}
+function getCdpPort(cdpUrl) {
+  try {
+    return parseInt(new URL(cdpUrl).port, 10) || 9222;
+  } catch {
+    return 9222;
+  }
+}
+function isChromeRunning() {
+  try {
+    if (process.platform === "win32") {
+      const out = execSync('tasklist /FI "IMAGENAME eq chrome.exe" /NH', {
+        encoding: "utf-8",
+        windowsHide: true
+      });
+      return out.includes("chrome.exe");
+    } else {
+      execSync('pgrep -x "chrome|chromium|google-chrome"', { encoding: "utf-8" });
+      return true;
+    }
+  } catch {
+    return false;
+  }
+}
+function syncProfile(sourceDir, targetDir) {
+  mkdirSync(targetDir, { recursive: true });
+  const entries = readdirSync(sourceDir, { withFileTypes: true });
+  for (const entry of entries) {
+    const src = join2(sourceDir, entry.name);
+    const dst = join2(targetDir, entry.name);
+    if (entry.isFile()) {
+      try {
+        cpSync(src, dst, { force: true });
+      } catch {
+      }
+    } else if (entry.isDirectory()) {
+      if (entry.name === "Default" || entry.name.startsWith("Profile ")) {
+        syncProfileDir(src, dst);
+      } else if (!SKIP_DIRS.has(entry.name)) {
+        try {
+          cpSync(src, dst, { recursive: true, force: true });
+        } catch {
+        }
+      }
+    }
+  }
+}
+function syncProfileDir(sourceDir, targetDir) {
+  mkdirSync(targetDir, { recursive: true });
+  let entries;
+  try {
+    entries = readdirSync(sourceDir, { withFileTypes: true });
+  } catch {
+    return;
+  }
+  for (const entry of entries) {
+    if (SKIP_DIRS.has(entry.name)) continue;
+    const src = join2(sourceDir, entry.name);
+    const dst = join2(targetDir, entry.name);
+    try {
+      if (entry.isFile()) {
+        cpSync(src, dst, { force: true });
+      } else if (entry.isDirectory()) {
+        cpSync(src, dst, { recursive: true, force: true });
+      }
+    } catch {
+    }
+  }
+}
+var CHROME_PATHS, SKIP_DIRS, BrowserClient;
 var init_client = __esm({
   "src/tools/browser/client.ts"() {
     "use strict";
+    CHROME_PATHS = [
+      // Windows
+      "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
+      "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
+      `${process.env.LOCALAPPDATA ?? ""}\\Google\\Chrome\\Application\\chrome.exe`,
+      // macOS
+      "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+      // Linux
+      "/usr/bin/google-chrome",
+      "/usr/bin/google-chrome-stable",
+      "/usr/bin/chromium-browser",
+      "/usr/bin/chromium"
+    ];
+    SKIP_DIRS = /* @__PURE__ */ new Set([
+      "Cache",
+      "Code Cache",
+      "GPUCache",
+      "Service Worker",
+      "CacheStorage",
+      "File System",
+      "blob_storage",
+      "IndexedDB",
+      "DawnCache",
+      "GrShaderCache",
+      "ShaderCache",
+      "optimization_guide_model_store",
+      "BrowserMetrics",
+      "Crashpad",
+      "component_crx_cache"
+    ]);
     BrowserClient = class {
       browser = null;
       context = null;
       _page = null;
       cdpUrl;
+      chromeProcess = null;
       constructor(cdpUrl) {
         this.cdpUrl = cdpUrl;
       }
       async connect() {
         if (this.browser) return;
         const { chromium } = await import("playwright");
-        this.browser = await chromium.connectOverCDP(this.cdpUrl);
+        try {
+          this.browser = await chromium.connectOverCDP(this.cdpUrl);
+        } catch {
+          await this.launchChrome();
+          this.browser = await chromium.connectOverCDP(this.cdpUrl);
+        }
         const contexts = this.browser.contexts();
         this.context = contexts[0] ?? await this.browser.newContext();
         const pages = this.context.pages();
         this._page = pages[0] ?? await this.context.newPage();
       }
+      async launchChrome() {
+        const chromePath = findChrome();
+        if (!chromePath) {
+          throw new Error(
+            "Chrome not found. Please install Chrome or start it manually with: chrome --remote-debugging-port=9222"
+          );
+        }
+        const port = getCdpPort(this.cdpUrl);
+        if (isChromeRunning()) {
+          console.error("[windows-use] Chrome is running without CDP. Restarting with --remote-debugging-port...");
+          try {
+            if (process.platform === "win32") {
+              execSync("taskkill /F /IM chrome.exe /T", { windowsHide: true, stdio: "ignore" });
+            } else {
+              execSync("pkill -f chrome", { stdio: "ignore" });
+            }
+          } catch {
+          }
+          await new Promise((r) => setTimeout(r, 1500));
+        }
+        const targetDir = join2(homedir2(), ".windows-use", "chrome-profile");
+        const userDir = findUserDataDir();
+        if (userDir) {
+          console.error("[windows-use] Syncing Chrome profile (cookies, login state)...");
+          syncProfile(userDir, targetDir);
+          console.error("[windows-use] Profile synced.");
+        } else {
+          mkdirSync(targetDir, { recursive: true });
+        }
+        console.error(`[windows-use] Launching Chrome with --remote-debugging-port=${port}`);
+        this.chromeProcess = spawn(
+          chromePath,
+          [
+            `--remote-debugging-port=${port}`,
+            `--user-data-dir=${targetDir}`
+          ],
+          { detached: true, stdio: "ignore" }
+        );
+        this.chromeProcess.unref();
+        for (let i = 0; i < 30; i++) {
+          try {
+            const res = await fetch(`http://localhost:${port}/json/version`);
+            if (res.ok) return;
+          } catch {
+          }
+          await new Promise((r) => setTimeout(r, 500));
+        }
+        throw new Error("Chrome launched but CDP endpoint did not become available within 15s");
+      }
       async getPage() {
         await this.connect();
         return this._page;
@@ -463,17 +694,79 @@ var init_registry = __esm({
   }
 });
+// src/tools/windows/grid-overlay.ts
+import sharp from "sharp";
+async function addCoordinateGrid(imageBuffer, width, height, options = {}) {
+  const gridSpacing = options.gridSpacing ?? 100;
+  const labelSpacing = options.labelSpacing ?? 200;
+  const majorSpacing = gridSpacing * 5;
+  const svgParts = [];
+  for (let x = gridSpacing; x < width; x += gridSpacing) {
+    const isMajor = x % majorSpacing === 0;
+    const opacity = isMajor ? 0.35 : 0.15;
+    const sw = isMajor ? 1.5 : 0.5;
+    svgParts.push(
+      `<line x1="${x}" y1="0" x2="${x}" y2="${height}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
+    );
+  }
+  for (let y = gridSpacing; y < height; y += gridSpacing) {
+    const isMajor = y % majorSpacing === 0;
+    const opacity = isMajor ? 0.35 : 0.15;
+    const sw = isMajor ? 1.5 : 0.5;
+    svgParts.push(
+      `<line x1="0" y1="${y}" x2="${width}" y2="${y}" stroke="rgba(255,50,50,${opacity})" stroke-width="${sw}"/>`
+    );
+  }
+  for (let x = labelSpacing; x < width; x += labelSpacing) {
+    const text = String(x);
+    const tw = text.length * 7.5 + 6;
+    svgParts.push(
+      `<rect x="${x - tw / 2}" y="2" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
+      `<text x="${x}" y="14" text-anchor="middle" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
+    );
+  }
+  for (let y = labelSpacing; y < height; y += labelSpacing) {
+    const text = String(y);
+    const tw = text.length * 7.5 + 6;
+    svgParts.push(
+      `<rect x="2" y="${y - 8}" width="${tw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
+      `<text x="5" y="${y + 4}" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${text}</text>`
+    );
+  }
+  svgParts.push(
+    `<rect x="2" y="2" width="22" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
+    `<text x="5" y="14" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">0,0</text>`
+  );
+  const dimText = `${width}x${height}`;
+  const dimTw = dimText.length * 7.5 + 6;
+  svgParts.push(
+    `<rect x="${width - dimTw - 2}" y="${height - 18}" width="${dimTw}" height="16" fill="rgba(0,0,0,0.65)" rx="3"/>`,
+    `<text x="${width - dimTw / 2 - 2}" y="${height - 6}" text-anchor="middle" fill="#ff6666" font-size="11" font-family="Consolas,monospace" font-weight="bold">${dimText}</text>`
+  );
+  const svg = Buffer.from(
+    `<svg width="${width}" height="${height}" xmlns="http://www.w3.org/2000/svg">${svgParts.join("")}</svg>`
+  );
+  return sharp(imageBuffer).composite([{ input: svg, top: 0, left: 0 }]).jpeg({ quality: 70 }).toBuffer();
+}
+var init_grid_overlay = __esm({
+  "src/tools/windows/grid-overlay.ts"() {
+    "use strict";
+  }
+});
 // src/tools/windows/screenshot.ts
 import { z as z2 } from "zod";
+import sharp2 from "sharp";
 var screenshotTool;
 var init_screenshot = __esm({
   "src/tools/windows/screenshot.ts"() {
     "use strict";
+    init_grid_overlay();
     screenshotTool = {
       name: "screenshot",
-      description: "Capture the full screen and return it as an image. Use this to see what is currently displayed.",
+      description: "Capture the full screen with a coordinate grid overlay. The grid shows pixel coordinates that match mouse_click/mouse_move coordinates. Returns a screenshot ID.",
       parameters: z2.object({}),
-      async execute() {
+      async execute(_args, ctx) {
         const { Monitor } = await import("node-screenshots");
         const monitors = Monitor.all();
         const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
@@ -481,11 +774,24 @@ var init_screenshot = __esm({
           return { type: "text", content: "Error: No monitor found" };
         }
         const image = primary.captureImageSync();
-        const buf = image.toPngSync();
+        const physW = image.width;
+        const physH = image.height;
+        const scaleFactor = primary.scaleFactor ?? 1;
+        const logicalW = Math.round(physW / scaleFactor);
+        const logicalH = Math.round(physH / scaleFactor);
+        const raw = image.toRawSync();
+        const resized = await sharp2(raw, {
+          raw: { width: physW, height: physH, channels: 4 }
+        }).resize(logicalW, logicalH).jpeg({ quality: 70 }).toBuffer();
+        const cleanBase64 = resized.toString("base64");
+        const id = ctx.screenshots.save(cleanBase64, "image/jpeg", "desktop");
+        const gridImage = await addCoordinateGrid(resized, logicalW, logicalH);
+        const gridBase64 = gridImage.toString("base64");
         return {
           type: "image",
-          base64: buf.toString("base64"),
-          mimeType: "image/png"
+          base64: gridBase64,
+          mimeType: "image/jpeg",
+          screenshotId: id
         };
       }
     };
@@ -752,8 +1058,47 @@ var init_write = __esm({
   }
 });
-// src/tools/browser/navigate.ts
+// src/tools/file/image.ts
 import { z as z8 } from "zod";
+import { readFileSync as readFileSync2, existsSync as existsSync3 } from "fs";
+import { extname } from "path";
+var IMAGE_EXTS, useLocalImageTool;
+var init_image = __esm({
+  "src/tools/file/image.ts"() {
+    "use strict";
+    IMAGE_EXTS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".bmp", ".webp"]);
+    useLocalImageTool = {
+      name: "use_local_image",
+      description: "Load a local image file and get a screenshot ID for it. Use this to reference local images in your report via [Image:img_X].",
+      parameters: z8.object({
+        path: z8.string().describe("Absolute path to the image file"),
+        label: z8.string().default("local").describe('Label for the image (e.g. "chart", "photo")')
+      }),
+      async execute(args, ctx) {
+        if (!existsSync3(args.path)) {
+          return { type: "text", content: `Error: File not found: ${args.path}` };
+        }
+        const ext = extname(args.path).toLowerCase();
+        if (!IMAGE_EXTS.has(ext)) {
+          return { type: "text", content: `Error: Not a supported image format (${ext}). Supported: ${[...IMAGE_EXTS].join(", ")}` };
+        }
+        const buf = readFileSync2(args.path);
+        const mimeType = ext === ".png" ? "image/png" : "image/jpeg";
+        const base64 = buf.toString("base64");
+        const id = ctx.screenshots.save(base64, mimeType, args.label);
+        return {
+          type: "image",
+          base64,
+          mimeType,
+          screenshotId: id
+        };
+      }
+    };
+  }
+});
+// src/tools/browser/navigate.ts
+import { z as z9 } from "zod";
 var browserNavigateTool;
 var init_navigate = __esm({
   "src/tools/browser/navigate.ts"() {
@@ -761,8 +1106,8 @@ var init_navigate = __esm({
     browserNavigateTool = {
       name: "browser_navigate",
       description: "Navigate the browser to a URL.",
-      parameters: z8.object({
-        url: z8.string().describe("The URL to navigate to")
+      parameters: z9.object({
+        url: z9.string().describe("The URL to navigate to")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -777,7 +1122,7 @@ Page title: ${title}` };
 });
 // src/tools/browser/click.ts
-import { z as z9 } from "zod";
+import { z as z10 } from "zod";
 var browserClickTool;
 var init_click = __esm({
   "src/tools/browser/click.ts"() {
@@ -785,8 +1130,8 @@ var init_click = __esm({
     browserClickTool = {
       name: "browser_click",
       description: "Click an element on the web page using a CSS selector or text content.",
-      parameters: z9.object({
-        selector: z9.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
+      parameters: z10.object({
+        selector: z10.string().describe('CSS selector or text to find the element (e.g., "button.submit", "text=Login")')
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -799,7 +1144,7 @@ var init_click = __esm({
 });
 // src/tools/browser/type.ts
-import { z as z10 } from "zod";
+import { z as z11 } from "zod";
 var browserTypeTool;
 var init_type = __esm({
   "src/tools/browser/type.ts"() {
@@ -807,10 +1152,10 @@ var init_type = __esm({
     browserTypeTool = {
       name: "browser_type",
       description: "Type text into an input field on the web page.",
-      parameters: z10.object({
-        selector: z10.string().describe("CSS selector for the input element"),
-        text: z10.string().describe("Text to type"),
-        clear: z10.boolean().default(true).describe("Whether to clear the field before typing")
+      parameters: z11.object({
+        selector: z11.string().describe("CSS selector for the input element"),
+        text: z11.string().describe("Text to type"),
+        clear: z11.boolean().default(true).describe("Whether to clear the field before typing")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -827,28 +1172,33 @@ var init_type = __esm({
 });
 // src/tools/browser/screenshot.ts
-import { z as z11 } from "zod";
+import { z as z12 } from "zod";
 var browserScreenshotTool;
 var init_screenshot2 = __esm({
   "src/tools/browser/screenshot.ts"() {
     "use strict";
     browserScreenshotTool = {
       name: "browser_screenshot",
-      description: "Take a screenshot of the current browser page.",
-      parameters: z11.object({
-        fullPage: z11.boolean().default(false).describe("Whether to capture the full scrollable page")
+      description: "Take a screenshot of the current browser page. Returns a screenshot ID (e.g. img_2) that you can reference later in report.",
+      parameters: z12.object({
+        fullPage: z12.boolean().default(false).describe("Whether to capture the full scrollable page")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
         const page = await browser.getPage();
         const buf = await page.screenshot({
-          type: "png",
-          fullPage: args.fullPage
+          type: "jpeg",
+          quality: 70,
+          fullPage: args.fullPage,
+          scale: "css"
         });
+        const base64 = buf.toString("base64");
+        const id = ctx.screenshots.save(base64, "image/jpeg", "browser");
         return {
           type: "image",
-          base64: buf.toString("base64"),
-          mimeType: "image/png"
+          base64,
+          mimeType: "image/jpeg",
+          screenshotId: id
         };
       }
     };
@@ -856,7 +1206,7 @@ var init_screenshot2 = __esm({
 });
 // src/tools/browser/content.ts
-import { z as z12 } from "zod";
+import { z as z13 } from "zod";
 var MAX_CONTENT_LENGTH, browserContentTool;
 var init_content = __esm({
   "src/tools/browser/content.ts"() {
@@ -865,7 +1215,7 @@ var init_content = __esm({
     browserContentTool = {
       name: "browser_content",
       description: "Get the text content of the current web page. Returns visible text, not HTML.",
-      parameters: z12.object({}),
+      parameters: z13.object({}),
       async execute(_args, ctx) {
         const browser = await ctx.getBrowser();
         const page = await browser.getPage();
@@ -888,7 +1238,7 @@ ${text}`
 });
 // src/tools/browser/scroll.ts
-import { z as z13 } from "zod";
+import { z as z14 } from "zod";
 var browserScrollTool;
 var init_scroll = __esm({
   "src/tools/browser/scroll.ts"() {
@@ -896,9 +1246,9 @@ var init_scroll = __esm({
     browserScrollTool = {
       name: "browser_scroll",
       description: "Scroll the current web page.",
-      parameters: z13.object({
-        direction: z13.enum(["up", "down"]).describe("Scroll direction"),
-        amount: z13.number().positive().default(500).describe("Pixels to scroll")
+      parameters: z14.object({
+        direction: z14.enum(["up", "down"]).describe("Scroll direction"),
+        amount: z14.number().positive().default(500).describe("Pixels to scroll")
       }),
       async execute(args, ctx) {
         const browser = await ctx.getBrowser();
@@ -912,42 +1262,26 @@ var init_scroll = __esm({
 });
 // src/tools/control/report.ts
-import { z as z14 } from "zod";
+import { z as z15 } from "zod";
 var reportTool;
 var init_report = __esm({
   "src/tools/control/report.ts"() {
     "use strict";
     reportTool = {
       name: "report",
-      description: "Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.",
-      parameters: z14.object({
-        status: z14.enum(["completed", "blocked", "need_guidance"]).describe(
+      description: 'Report progress back to the caller. Call this when the task is completed, when you are blocked, or when you need guidance. Calling this STOPS your execution immediately.\n\nThe content field supports rich document format: mix text with screenshots using [Image:img_1] markers. Example:\n"Here is the current state:\n[Image:img_2]\nThe page shows..."',
+      parameters: z15.object({
+        status: z15.enum(["completed", "blocked", "need_guidance"]).describe(
           '"completed" = task done, "blocked" = cannot proceed, "need_guidance" = need a decision'
         ),
-        summary: z14.string().describe("Concise human-readable summary of what was accomplished or what the problem is"),
-        include_screenshot: z14.boolean().default(false).describe("Whether to capture and include a screenshot of the current state"),
-        data: z14.unknown().optional().describe("Optional structured data to return")
+        content: z15.string().describe('Rich report content. Use [Image:img_X] to embed screenshots captured earlier. Example: "Task done.\\n[Image:img_1]\\nThe page shows the result."'),
+        data: z15.unknown().optional().describe("Optional structured data to return")
       }),
       async execute(args) {
-        let screenshot;
-        if (args.include_screenshot) {
-          try {
-            const { Monitor } = await import("node-screenshots");
-            const monitors = Monitor.all();
-            const primary = monitors.find((m) => m.isPrimary()) ?? monitors[0];
-            if (primary) {
-              const image = primary.captureImageSync();
-              const buf = image.toPngSync();
-              screenshot = buf.toString("base64");
-            }
-          } catch {
-          }
-        }
         return {
           type: "report",
           status: args.status,
-          summary: args.summary,
-          screenshot,
+          content: args.content,
           data: args.data
         };
       }
@@ -967,6 +1301,7 @@ function createToolRegistry() {
   registry2.register(runCommandTool);
   registry2.register(fileReadTool);
   registry2.register(fileWriteTool);
+  registry2.register(useLocalImageTool);
   registry2.register(browserNavigateTool);
   registry2.register(browserClickTool);
   registry2.register(browserTypeTool);
@@ -986,6 +1321,7 @@ var init_tools = __esm({
     init_command();
     init_read();
     init_write();
+    init_image();
     init_navigate();
     init_click();
     init_type();
@@ -996,6 +1332,59 @@ var init_tools = __esm({
   }
 });
+// src/tools/types.ts
+function parseReportContent(content, store) {
+  const blocks = [];
+  const regex = /\[Image:(img_\d+)\]/g;
+  let lastIndex = 0;
+  let match;
+  while ((match = regex.exec(content)) !== null) {
+    if (match.index > lastIndex) {
+      blocks.push({ type: "text", text: content.slice(lastIndex, match.index) });
+    }
+    const id = match[1];
+    const screenshot = store.get(id);
+    if (screenshot) {
+      blocks.push({
+        type: "image",
+        id: screenshot.id,
+        base64: screenshot.base64,
+        mimeType: screenshot.mimeType,
+        label: screenshot.label
+      });
+    } else {
+      blocks.push({ type: "text", text: match[0] });
+    }
+    lastIndex = regex.lastIndex;
+  }
+  if (lastIndex < content.length) {
+    blocks.push({ type: "text", text: content.slice(lastIndex) });
+  }
+  return blocks;
+}
+var ScreenshotStore;
+var init_types = __esm({
+  "src/tools/types.ts"() {
+    "use strict";
+    ScreenshotStore = class {
+      counter = 0;
+      store = /* @__PURE__ */ new Map();
+      save(base64, mimeType, label) {
+        this.counter++;
+        const id = `img_${this.counter}`;
+        this.store.set(id, { id, base64, mimeType, label });
+        return id;
+      }
+      get(id) {
+        return this.store.get(id);
+      }
+      listIds() {
+        return [...this.store.keys()];
+      }
+    };
+  }
+});
 // src/mcp/session-registry.ts
 import crypto from "crypto";
 var SessionRegistry;
@@ -1007,20 +1396,23 @@ var init_session_registry = __esm({
     init_runner();
     init_client();
     init_tools();
+    init_types();
     SessionRegistry = class {
       sessions = /* @__PURE__ */ new Map();
       create(config) {
         const id = crypto.randomUUID();
-        const contextManager = new ContextManager(config.contextWindowSize);
+        const contextManager = new ContextManager();
         const llmClient = new LLMClient(config);
         const browserClient = new BrowserClient(config.cdpUrl);
         const toolRegistry = createToolRegistry();
+        const screenshotStore = new ScreenshotStore();
         const toolContext = {
           sessionId: id,
           cdpUrl: config.cdpUrl,
           getBrowser: () => {
             return browserClient.connect().then(() => browserClient);
-          }
+          },
+          screenshots: screenshotStore
         };
         const runner = new AgentRunner(
           llmClient,
@@ -1040,6 +1432,7 @@ var init_session_registry = __esm({
           config,
           runner,
           browserClient,
+          screenshots: screenshotStore,
           timeoutHandle
         };
         this.sessions.set(id, session);
@@ -1076,18 +1469,19 @@ var init_session_registry = __esm({
 });
 // src/mcp/tools.ts
-import { z as z15 } from "zod";
+import { z as z16 } from "zod";
 function registerMcpTools(server2, registry2) {
   server2.tool(
     "create_session",
     "Create a new automation session with a small LLM agent. Returns a session_id.",
     {
-      api_key: z15.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
-      base_url: z15.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
-      model: z15.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
-      cdp_url: z15.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
-      timeout_ms: z15.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
-      max_steps: z15.number().optional().describe("Max tool-calling steps per instruction (default: 50)")
+      api_key: z16.string().optional().describe("LLM API key (or set WINDOWS_USE_API_KEY env)"),
+      base_url: z16.string().optional().describe("OpenAI-compatible base URL (or set WINDOWS_USE_BASE_URL env)"),
+      model: z16.string().optional().describe("Model name (or set WINDOWS_USE_MODEL env)"),
+      cdp_url: z16.string().optional().describe("Chrome CDP URL (default: http://localhost:9222)"),
+      timeout_ms: z16.number().optional().describe("Session inactivity timeout in ms (default: 300000)"),
+      max_steps: z16.number().optional().describe("Max tool-calling steps per instruction (default: 50)"),
+      max_rounds: z16.number().optional().describe("Max instruction rounds per session (default: 20)")
     },
     async (args) => {
       const config = loadConfig({
@@ -1096,7 +1490,8 @@ function registerMcpTools(server2, registry2) {
         model: args.model,
         cdpUrl: args.cdp_url,
         timeoutMs: args.timeout_ms,
-        maxSteps: args.max_steps
+        maxSteps: args.max_steps,
+        maxRounds: args.max_rounds
       });
       const session = registry2.create(config);
       return {
@@ -1111,10 +1506,10 @@ function registerMcpTools(server2, registry2) {
   );
   server2.tool(
     "send_instruction",
-    "Send a task instruction to the agent in a session. The agent executes it and returns a status report.",
+    "Send a task instruction to the agent in a session. The agent executes it and returns a rich report with text and images.",
     {
-      session_id: z15.string().describe("Session ID from create_session"),
-      instruction: z15.string().describe("What you want the agent to do, in natural language")
+      session_id: z16.string().describe("Session ID from create_session"),
+      instruction: z16.string().describe("What you want the agent to do, in natural language")
     },
     async (args) => {
       const session = registry2.get(args.session_id);
@@ -1134,32 +1529,37 @@ function registerMcpTools(server2, registry2) {
       registry2.touch(args.session_id);
       const result = await session.runner.run(args.instruction);
       registry2.touch(args.session_id);
-      const content = [
-        {
-          type: "text",
-          text: JSON.stringify({
-            status: result.status,
-            summary: result.summary,
-            steps_used: result.stepsUsed,
-            ...result.data !== void 0 ? { data: result.data } : {}
-          })
+      const mcpContent = [];
+      mcpContent.push({
+        type: "text",
+        text: JSON.stringify({
+          status: result.status,
+          steps_used: result.stepsUsed,
+          round: session.runner.currentRound,
+          rounds_remaining: session.config.maxRounds - session.runner.currentRound,
+          ...result.data !== void 0 ? { data: result.data } : {}
+        })
+      });
+      const blocks = parseReportContent(result.content, session.screenshots);
+      for (const block of blocks) {
+        if (block.type === "text") {
+          mcpContent.push({ type: "text", text: block.text });
+        } else {
+          mcpContent.push({
+            type: "image",
+            data: block.base64,
+            mimeType: block.mimeType
+          });
         }
-      ];
-      if (result.screenshot) {
-        content.push({
-          type: "image",
-          data: result.screenshot,
-          mimeType: "image/png"
-        });
       }
-      return { content };
+      return { content: mcpContent };
     }
   );
   server2.tool(
     "done_session",
     "Terminate a session and free all resources.",
     {
-      session_id: z15.string().describe("Session ID to terminate")
+      session_id: z16.string().describe("Session ID to terminate")
     },
     async (args) => {
       await registry2.destroy(args.session_id);
@@ -1178,6 +1578,7 @@ var init_tools2 = __esm({
   "src/mcp/tools.ts"() {
     "use strict";
     init_loader();
+    init_types();
   }
 });
@@ -1212,9 +1613,67 @@ var init_server = __esm({
 // src/cli.ts
 init_loader();
 init_session_registry();
+init_types();
 import { program } from "commander";
-program.name("windows-use").description("Run Windows/browser automation tasks using a small LLM agent").argument("[instruction]", "The task to perform").option("--api-key <key>", "LLM API key").option("--base-url <url>", "OpenAI-compatible base URL").option("--model <name>", "Model name").option("--cdp-url <url>", "Chrome CDP URL (default: http://localhost:9222)").option("--max-steps <n>", "Max steps before forced stop", parseInt).option("--mcp", "Start as MCP server instead of running a task").action(async (instruction, opts) => {
-  if (opts.mcp || !instruction) {
+import { createInterface } from "readline";
+import { createServer } from "http";
+import { mkdirSync as mkdirSync2, writeFileSync } from "fs";
+import { join as join3 } from "path";
+import { tmpdir } from "os";
+function startScreenshotServer(screenshotDir) {
+  let counter = 0;
+  const files = /* @__PURE__ */ new Map();
+  return new Promise((resolve) => {
+    const server2 = createServer((req, res) => {
+      const name = req.url?.slice(1) ?? "";
+      const buf = files.get(name);
+      if (buf) {
+        const ct = name.endsWith(".jpg") ? "image/jpeg" : "image/png";
+        res.writeHead(200, { "Content-Type": ct });
+        res.end(buf);
+      } else {
+        res.writeHead(200, { "Content-Type": "text/html" });
+        const links = [...files.keys()].map((f) => `<a href="/${f}"><img src="/${f}" style="max-width:400px;margin:8px"></a>`).join("\n");
+        res.end(`<html><body style="background:#1a1a1a;display:flex;flex-wrap:wrap">${links}</body></html>`);
+      }
+    });
+    server2.listen(0, "127.0.0.1", () => {
+      const addr = server2.address();
+      const port = typeof addr === "object" && addr ? addr.port : 0;
+      const save = (base64) => {
+        counter++;
+        const name = `screenshot-${counter}.jpg`;
+        const buf = Buffer.from(base64, "base64");
+        files.set(name, buf);
+        const filePath = join3(screenshotDir, name);
+        writeFileSync(filePath, buf);
+        return `http://127.0.0.1:${port}/${name}`;
+      };
+      resolve({ port, save });
+    });
+  });
+}
+program.name("windows-use").description("Run Windows/browser automation tasks using a small LLM agent").version("0.2.0");
+program.command("init").description("Interactive setup \u2014 save config to ~/.windows-use.json").action(async () => {
+  const rl = createInterface({ input: process.stdin, output: process.stdout });
+  const ask = (q) => new Promise((resolve) => rl.question(q, (a) => resolve(a.trim())));
+  console.log("\n\u{1F527} windows-use setup\n");
+  const baseURL = await ask("Base URL (OpenAI-compatible endpoint): ");
+  const apiKey = await ask("API Key: ");
+  const model = await ask("Model name (e.g. qwen3.5-flash): ");
+  rl.close();
+  const config = {};
+  if (baseURL) config.baseURL = baseURL;
+  if (apiKey) config.apiKey = apiKey;
+  if (model) config.model = model;
+  const configPath = getConfigPath();
+  writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n", "utf-8");
+  console.log(`
+\u2705 Config saved to ${configPath}`);
+  console.log('You can now run: windows-use "your task here"\n');
+});
+program.argument("[instruction]", "The task to perform").option("--api-key <key>", "LLM API key").option("--base-url <url>", "OpenAI-compatible base URL").option("--model <name>", "Model name").option("--cdp-url <url>", "Chrome CDP URL (default: http://localhost:9222)").option("--max-steps <n>", "Max tool-calling steps per instruction", parseInt).option("--max-rounds <n>", "Max instruction rounds per session", parseInt).option("--mcp", "Start as MCP server instead of running a task").action(async (instruction, opts) => {
+  if (opts.mcp) {
     await init_server().then(() => server_exports);
     return;
   }
@@ -1225,28 +1684,102 @@ program.name("windows-use").description("Run Windows/browser automation tasks us
       baseURL: opts.baseUrl,
       model: opts.model,
       cdpUrl: opts.cdpUrl,
-      maxSteps: opts.maxSteps
+      maxSteps: opts.maxSteps,
+      maxRounds: opts.maxRounds
     });
   } catch (err) {
     console.error(
-      "Configuration error. Set WINDOWS_USE_API_KEY, WINDOWS_USE_BASE_URL, WINDOWS_USE_MODEL env vars or pass --api-key, --base-url, --model flags."
+      "Configuration error. Run `windows-use init` to set up, or pass --api-key, --base-url, --model flags."
     );
     console.error(err instanceof Error ? err.message : err);
     process.exit(1);
   }
+  const screenshotDir = join3(tmpdir(), "windows-use-screenshots");
+  mkdirSync2(screenshotDir, { recursive: true });
+  const { port, save: saveScreenshot } = await startScreenshotServer(screenshotDir);
   const registry2 = new SessionRegistry();
   const session = registry2.create(config);
-  console.error(`[windows-use] Session ${session.id} created`);
-  console.error(`[windows-use] Running: "${instruction}"`);
+  session.runner.setOnStep((event) => {
+    const prefix = `  [step ${event.step}]`;
+    switch (event.type) {
+      case "thinking":
+        console.log(`${prefix} \u{1F4AD} ${event.content}`);
+        break;
+      case "tool_call": {
+        const argsStr = typeof event.args === "object" ? JSON.stringify(event.args, null, 0) : String(event.args);
+        const preview = argsStr.length > 120 ? argsStr.slice(0, 120) + "..." : argsStr;
+        console.log(`${prefix} \u{1F527} ${event.name}(${preview})`);
+        break;
+      }
+      case "tool_result":
+        console.log(`${prefix} \u2713 ${event.name} \u2192 ${event.result}`);
+        break;
+      case "error":
+        console.log(`${prefix} \u2717 ${event.message}`);
+        break;
+    }
+  });
+  console.log(`
+[windows-use] Session ${session.id} created`);
+  console.log(`[windows-use] Model: ${config.model}`);
+  console.log(`[windows-use] Screenshots: http://127.0.0.1:${port}`);
+  console.log(`[windows-use] Type "exit" or Ctrl+C to quit.
+`);
+  const rl = createInterface({ input: process.stdin, output: process.stdout });
+  const ask = (prompt) => new Promise((resolve) => rl.question(prompt, (a) => resolve(a.trim())));
+  let nextInstruction = instruction ?? "";
+  const printResult = (result) => {
+    const statusIcon = result.status === "completed" ? "\u2705" : result.status === "blocked" ? "\u{1F6AB}" : "\u2753";
+    console.log(`
+${statusIcon} [${result.status}]`);
+    const blocks = parseReportContent(result.content, session.screenshots);
+    for (const block of blocks) {
+      if (block.type === "text") {
+        process.stdout.write(block.text);
+      } else {
+        const url = saveScreenshot(block.base64);
+        process.stdout.write(`
+   \u{1F4F8} ${block.label}: ${url}
+`);
+      }
+    }
+    if (result.data) {
+      console.log(`
+   Data: ${JSON.stringify(result.data)}`);
+    }
+    const roundInfo = `round ${session.runner.currentRound}/${config.maxRounds}`;
+    console.log(`
+   (${result.stepsUsed} steps, ${roundInfo})
+`);
+  };
   try {
-    const result = await session.runner.run(instruction);
-    console.log(JSON.stringify(result, null, 2));
-    await registry2.destroy(session.id);
-    process.exit(result.status === "completed" ? 0 : 1);
+    while (true) {
+      if (!nextInstruction) {
+        nextInstruction = await ask("> ");
+      } else {
+        console.log(`> ${nextInstruction}`);
+      }
+      if (!nextInstruction || nextInstruction.toLowerCase() === "exit") {
+        break;
+      }
+      if (session.runner.roundsExhausted) {
+        console.log(`[windows-use] Session reached max rounds (${config.maxRounds}). Type "exit" to quit.
+`);
+        nextInstruction = "";
+        continue;
+      }
+      console.log("[windows-use] Running...\n");
+      const result = await session.runner.run(nextInstruction);
+      printResult(result);
+      nextInstruction = "";
+    }
   } catch (err) {
-    console.error("Fatal error:", err instanceof Error ? err.message : err);
+    console.error("\nFatal error:", err instanceof Error ? err.message : err);
+  } finally {
+    rl.close();
     await registry2.destroyAll();
-    process.exit(1);
+    console.log("[windows-use] Session ended.");
+    process.exit(0);
   }
 });
 program.parse();