npm - @mindstudio-ai/remy - Versions diffs - 0.1.192 → 0.1.194 - Mend

@mindstudio-ai/remy 0.1.192 → 0.1.194

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (4)

package/dist/headless.js +57 -16
package/dist/index.js +57 -16
package/dist/subagents/browserAutomation/prompt.md +14 -3
package/package.json +1 -1

package/dist/headless.js CHANGED Viewed

@@ -2846,10 +2846,14 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
   let onLog;
   let model;
   let path12;
+  let fullPage = true;
   if (typeof promptOrOptions === "object" && promptOrOptions !== null) {
     prompt = promptOrOptions.prompt;
     existingUrl = promptOrOptions.imageUrl;
     path12 = promptOrOptions.path;
+    if (promptOrOptions.fullPage !== void 0) {
+      fullPage = promptOrOptions.fullPage;
+    }
     onLog = promptOrOptions.onLog;
     model = promptOrOptions.model;
   } else {
@@ -2861,9 +2865,9 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
     url = existingUrl;
   } else {
     const ssResult = await sidecarRequest(
-      "/screenshot-full-page",
+      fullPage ? "/screenshot-full-page" : "/screenshot-viewport",
       path12 ? { path: path12 } : void 0,
-      { timeout: 12e4 }
+      { timeout: fullPage ? 12e4 : 3e4 }
     );
     url = ssResult?.url || ssResult?.screenshotUrl;
     if (!url) {
@@ -3705,7 +3709,7 @@ var BROWSER_TOOLS = [
                   "screenshotFullPage",
                   "screenshotViewport"
                 ],
-                description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
+                description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
               },
               ref: {
                 type: "string",
@@ -3751,6 +3755,14 @@ var BROWSER_TOOLS = [
                 type: "array",
                 items: { type: "string" },
                 description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
+              },
+              scrollToSelector: {
+                type: "string",
+                description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
+              },
+              scrollY: {
+                type: "number",
+                description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
               }
             },
             required: ["command"]
@@ -3892,9 +3904,10 @@ function resolveModel(surfaceId, models, fallback) {
 // src/subagents/browserAutomation/index.ts
 var log7 = createLogger("browser-automation");
-async function runBrowserAutomation(task, context) {
+async function runBrowserAutomation(task, context, opts) {
   const release = await acquireBrowserLock();
   try {
+    let lastBrowserCommandViewport;
     const result = await runSubAgent({
       system: getBrowserAutomationPrompt(),
       task,
@@ -3920,6 +3933,7 @@ async function runBrowserAutomation(task, context) {
           try {
             return await captureAndAnalyzeScreenshot({
               path: _input.path,
+              fullPage: true,
               onLog,
               model: resolveModel(
                 "imageAnalysis",
@@ -3952,6 +3966,11 @@ async function runBrowserAutomation(task, context) {
               (s) => s.command === "screenshotViewport" && s.result?.url
             );
             if (screenshotSteps.length > 0) {
+              const lastStep = screenshotSteps[screenshotSteps.length - 1];
+              lastBrowserCommandViewport = {
+                url: lastStep.result.url,
+                styleMap: lastStep.result.styleMap
+              };
               const visionOverride = {
                 model: resolveModel(
                   "imageAnalysis",
@@ -3998,10 +4017,12 @@ async function runBrowserAutomation(task, context) {
       captureArtifacts: ["screenshotFullPage"]
     });
     context.subAgentMessages?.set(context.toolCallId, result.messages);
-    const ss = result.artifacts?.screenshotFullPage;
+    const fullPage = result.artifacts?.screenshotFullPage;
+    const viewport = lastBrowserCommandViewport;
+    const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
     return {
       text: result.text,
-      ...ss?.url ? { screenshot: { url: ss.url, styleMap: ss.styleMap } } : {}
+      ...preferred?.url ? { screenshot: { url: preferred.url, styleMap: preferred.styleMap } } : {}
     };
   } finally {
     release();
@@ -4042,10 +4063,14 @@ var screenshotTool = {
   clearable: true,
   definition: {
     name: "screenshot",
-    description: "Capture a full-height screenshot of the app preview and get a description of what's on screen. Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal), use the instructions param to describe the steps.",
+    description: "Capture a screenshot of the app preview and get a description of what's on screen. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 for a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 for overall composition or content past the fold). Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal, scrolling to a section), use the instructions param to describe the steps.",
     inputSchema: {
       type: "object",
       properties: {
+        fullPage: {
+          type: "boolean",
+          description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
+        },
         prompt: {
           type: "string",
           description: "Optional question about the screenshot. If omitted, returns a general description of what's visible."
@@ -4060,12 +4085,15 @@ var screenshotTool = {
         },
         instructions: {
           type: "string",
-          description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
+          description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, scrolling to a section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route, scroll to a section. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
         }
-      }
+      },
+      required: ["fullPage"]
     }
   },
   async execute(input, context) {
+    const fullPage = input.fullPage === true;
+    const shotKind = fullPage ? "full-page" : "viewport";
     try {
       if (input.imageUrl) {
         return await captureAndAnalyzeScreenshot({
@@ -4076,8 +4104,10 @@ var screenshotTool = {
         });
       }
       if (input.instructions && context) {
-        const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
-        const result = await runBrowserAutomation(task, context);
+        const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
+        const result = await runBrowserAutomation(task, context, {
+          capture: fullPage ? "fullPage" : "viewport"
+        });
         if (!result.screenshot) {
           return result.text;
         }
@@ -4094,6 +4124,7 @@ var screenshotTool = {
         return await captureAndAnalyzeScreenshot({
           prompt: input.prompt,
           path: input.path,
+          fullPage,
           onLog: context?.onLog,
           model: resolveModel("imageAnalysis", context?.models, context?.model)
         });
@@ -4393,10 +4424,14 @@ __export(screenshot_exports, {
 var definition5 = {
   clearable: true,
   name: "screenshot",
-  description: "Capture a full-height screenshot of the current app preview. Returns a CDN URL along with visual analysis. Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
+  description: "Capture a screenshot of the current app preview and get it back with visual analysis. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 use it to review a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 use it to review overall composition or a layout you can't see in one screen). Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
   inputSchema: {
     type: "object",
     properties: {
+      fullPage: {
+        type: "boolean",
+        description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
+      },
       prompt: {
         type: "string",
         description: "Optional specific question about the screenshot. Use a bulleted list to ask many questions at once."
@@ -4407,16 +4442,21 @@ var definition5 = {
       },
       instructions: {
         type: "string",
-        description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Only use instructions when you need to trigger stateful changes. Never describe what names or values to use when applying the isntructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
+        description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, scrolling to a specific section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
       }
-    }
+    },
+    required: ["fullPage"]
   }
 };
 async function execute5(input, onLog, context) {
+  const fullPage = input.fullPage === true;
+  const shotKind = fullPage ? "full-page" : "viewport";
   if (input.instructions && context) {
     try {
-      const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
-      const result = await runBrowserAutomation(task, context);
+      const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
+      const result = await runBrowserAutomation(task, context, {
+        capture: fullPage ? "fullPage" : "viewport"
+      });
       if (!result.screenshot) {
         return result.text;
       }
@@ -4436,6 +4476,7 @@ async function execute5(input, onLog, context) {
     return await captureAndAnalyzeScreenshot({
       prompt: input.prompt,
       path: input.path,
+      fullPage,
       onLog,
       model: resolveModel("imageAnalysis", context?.models, context?.model)
     });

package/dist/index.js CHANGED Viewed

@@ -3247,10 +3247,14 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
   let onLog;
   let model;
   let path13;
+  let fullPage = true;
   if (typeof promptOrOptions === "object" && promptOrOptions !== null) {
     prompt = promptOrOptions.prompt;
     existingUrl = promptOrOptions.imageUrl;
     path13 = promptOrOptions.path;
+    if (promptOrOptions.fullPage !== void 0) {
+      fullPage = promptOrOptions.fullPage;
+    }
     onLog = promptOrOptions.onLog;
     model = promptOrOptions.model;
   } else {
@@ -3262,9 +3266,9 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
     url = existingUrl;
   } else {
     const ssResult = await sidecarRequest(
-      "/screenshot-full-page",
+      fullPage ? "/screenshot-full-page" : "/screenshot-viewport",
       path13 ? { path: path13 } : void 0,
-      { timeout: 12e4 }
+      { timeout: fullPage ? 12e4 : 3e4 }
     );
     url = ssResult?.url || ssResult?.screenshotUrl;
     if (!url) {
@@ -4163,7 +4167,7 @@ var init_tools = __esm({
                       "screenshotFullPage",
                       "screenshotViewport"
                     ],
-                    description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
+                    description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
                   },
                   ref: {
                     type: "string",
@@ -4209,6 +4213,14 @@ var init_tools = __esm({
                     type: "array",
                     items: { type: "string" },
                     description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
+                  },
+                  scrollToSelector: {
+                    type: "string",
+                    description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
+                  },
+                  scrollY: {
+                    type: "number",
+                    description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
                   }
                 },
                 required: ["command"]
@@ -4263,9 +4275,10 @@ var init_prompt2 = __esm({
 });
 // src/subagents/browserAutomation/index.ts
-async function runBrowserAutomation(task, context) {
+async function runBrowserAutomation(task, context, opts) {
   const release = await acquireBrowserLock();
   try {
+    let lastBrowserCommandViewport;
     const result = await runSubAgent({
       system: getBrowserAutomationPrompt(),
       task,
@@ -4291,6 +4304,7 @@ async function runBrowserAutomation(task, context) {
           try {
             return await captureAndAnalyzeScreenshot({
               path: _input.path,
+              fullPage: true,
               onLog,
               model: resolveModel(
                 "imageAnalysis",
@@ -4323,6 +4337,11 @@ async function runBrowserAutomation(task, context) {
               (s) => s.command === "screenshotViewport" && s.result?.url
             );
             if (screenshotSteps.length > 0) {
+              const lastStep = screenshotSteps[screenshotSteps.length - 1];
+              lastBrowserCommandViewport = {
+                url: lastStep.result.url,
+                styleMap: lastStep.result.styleMap
+              };
               const visionOverride = {
                 model: resolveModel(
                   "imageAnalysis",
@@ -4369,10 +4388,12 @@ async function runBrowserAutomation(task, context) {
       captureArtifacts: ["screenshotFullPage"]
     });
     context.subAgentMessages?.set(context.toolCallId, result.messages);
-    const ss = result.artifacts?.screenshotFullPage;
+    const fullPage = result.artifacts?.screenshotFullPage;
+    const viewport = lastBrowserCommandViewport;
+    const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
     return {
       text: result.text,
-      ...ss?.url ? { screenshot: { url: ss.url, styleMap: ss.styleMap } } : {}
+      ...preferred?.url ? { screenshot: { url: preferred.url, styleMap: preferred.styleMap } } : {}
     };
   } finally {
     release();
@@ -4437,10 +4458,14 @@ var init_screenshot2 = __esm({
       clearable: true,
       definition: {
         name: "screenshot",
-        description: "Capture a full-height screenshot of the app preview and get a description of what's on screen. Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal), use the instructions param to describe the steps.",
+        description: "Capture a screenshot of the app preview and get a description of what's on screen. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 for a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 for overall composition or content past the fold). Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal, scrolling to a section), use the instructions param to describe the steps.",
         inputSchema: {
           type: "object",
           properties: {
+            fullPage: {
+              type: "boolean",
+              description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
+            },
             prompt: {
               type: "string",
               description: "Optional question about the screenshot. If omitted, returns a general description of what's visible."
@@ -4455,12 +4480,15 @@ var init_screenshot2 = __esm({
             },
             instructions: {
               type: "string",
-              description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
+              description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, scrolling to a section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route, scroll to a section. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
             }
-          }
+          },
+          required: ["fullPage"]
         }
       },
       async execute(input, context) {
+        const fullPage = input.fullPage === true;
+        const shotKind = fullPage ? "full-page" : "viewport";
         try {
           if (input.imageUrl) {
             return await captureAndAnalyzeScreenshot({
@@ -4471,8 +4499,10 @@ var init_screenshot2 = __esm({
             });
           }
           if (input.instructions && context) {
-            const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
-            const result = await runBrowserAutomation(task, context);
+            const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
+            const result = await runBrowserAutomation(task, context, {
+              capture: fullPage ? "fullPage" : "viewport"
+            });
             if (!result.screenshot) {
               return result.text;
             }
@@ -4489,6 +4519,7 @@ var init_screenshot2 = __esm({
             return await captureAndAnalyzeScreenshot({
               prompt: input.prompt,
               path: input.path,
+              fullPage,
               onLog: context?.onLog,
               model: resolveModel("imageAnalysis", context?.models, context?.model)
             });
@@ -4826,10 +4857,14 @@ __export(screenshot_exports, {
   execute: () => execute5
 });
 async function execute5(input, onLog, context) {
+  const fullPage = input.fullPage === true;
+  const shotKind = fullPage ? "full-page" : "viewport";
   if (input.instructions && context) {
     try {
-      const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
-      const result = await runBrowserAutomation(task, context);
+      const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
+      const result = await runBrowserAutomation(task, context, {
+        capture: fullPage ? "fullPage" : "viewport"
+      });
       if (!result.screenshot) {
         return result.text;
       }
@@ -4849,6 +4884,7 @@ async function execute5(input, onLog, context) {
     return await captureAndAnalyzeScreenshot({
       prompt: input.prompt,
       path: input.path,
+      fullPage,
       onLog,
       model: resolveModel("imageAnalysis", context?.models, context?.model)
     });
@@ -4869,10 +4905,14 @@ var init_screenshot3 = __esm({
     definition5 = {
       clearable: true,
       name: "screenshot",
-      description: "Capture a full-height screenshot of the current app preview. Returns a CDN URL along with visual analysis. Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
+      description: "Capture a screenshot of the current app preview and get it back with visual analysis. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 use it to review a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 use it to review overall composition or a layout you can't see in one screen). Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
       inputSchema: {
         type: "object",
         properties: {
+          fullPage: {
+            type: "boolean",
+            description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
+          },
           prompt: {
             type: "string",
             description: "Optional specific question about the screenshot. Use a bulleted list to ask many questions at once."
@@ -4883,9 +4923,10 @@ var init_screenshot3 = __esm({
           },
           instructions: {
             type: "string",
-            description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Only use instructions when you need to trigger stateful changes. Never describe what names or values to use when applying the isntructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
+            description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, scrolling to a specific section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
           }
-        }
+        },
+        required: ["fullPage"]
       }
     };
   }

package/dist/subagents/browserAutomation/prompt.md CHANGED Viewed

@@ -43,7 +43,7 @@ Note: the snapshot concatenates inline text and strips whitespace. If you need t
 - `navigate`: Navigate to a new URL within the app. Waits for the new page to load before continuing with subsequent steps. Use this instead of evaluate with `window.location.href` when you need to navigate and then continue interacting with the new page. Steps after navigate execute on the new page automatically.
 - `evaluate`: Run arbitrary JavaScript in the page and return the result.
 - `styles`: Read computed CSS styles from page elements. Pass a `properties` array with camelCase CSS property names (e.g., `["backgroundColor", "borderRadius", "fontSize"]`). Omit `properties` for a default set covering colors, typography, spacing, borders, shadows, dimensions, and layout. Uses the same targeting as click/type (ref, text, role, label, selector). Omit the target to get styles for all elements from the last snapshot.
-- `screenshotViewport`: Take a screenshot of the current viewport. Returns CDN url with full text analysis and dimensions. Useful at the end of an action batch to visually see things like layout shift or overflow. Do not use if you can get what you need with other tools - only use when you need to visually see the viewport.
+- `screenshotViewport`: Take a screenshot of the visible viewport. Returns CDN url with full text analysis and dimensions. To capture a specific section, set `scrollToSelector` (a CSS selector) — or `scrollY` (an absolute offset) — on this same step; it scrolls the target into view and captures it atomically, so you do NOT need a separate scroll step. Do not use if you can get what you need with other tools - only use when you need to visually see the viewport.
 ### Element targeting (tried in order)
@@ -109,6 +109,15 @@ Select a dropdown option and screenshot the result:
 }
 ```
+Capture a specific below-the-fold section (scroll + capture in one atomic step):
+```json
+{
+  "steps": [
+    { "command": "screenshotViewport", "scrollToSelector": "#pricing" }
+  ]
+}
+```
 Navigate to a sub-page and interact with it:
 ```json
 {
@@ -139,8 +148,10 @@ Check a count with evaluate:
 ```
 </examples>
-### Full Page Screenshot
-You can use the `screenshotFullPage` tool to take a full-height screenshot of the current page. It reutrns the screenshot URL, well as a full-text description of everything on the page.
+### Final Screenshot
+How you take the final screenshot depends on what the task asked for:
+- **Whole page** → use the standalone `screenshotFullPage` tool. It takes a full-height screenshot of the current page and returns the URL plus a full-text description.
+- **A specific section / viewport** → use a `browserCommand` batch ending in a `screenshotViewport` step with `scrollToSelector` set to the section (e.g. `{ "command": "screenshotViewport", "scrollToSelector": "#pricing" }`). This scrolls the section into view and captures it in one atomic step. Do this rather than a separate scroll step followed by a capture — capturing the viewport is only reliable when the scroll and the shot are in the same step.
 <rules>
   - Always batch steps into a single browserCommand call. Don't send one step per turn. Type + click + wait should be one call, not three separate turns.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mindstudio-ai/remy",
-  "version": "0.1.192",
+  "version": "0.1.194",
   "description": "MindStudio coding agent",
   "repository": {
     "type": "git",