npm - @mindstudio-ai/remy - Versions diffs - 0.1.193 → 0.1.194 - Mend

@mindstudio-ai/remy 0.1.193 → 0.1.194

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (4)

package/dist/headless.js +19 -19
package/dist/index.js +19 -19
package/dist/subagents/browserAutomation/prompt.md +13 -2
package/package.json +1 -1

package/dist/headless.js CHANGED Viewed

@@ -3709,7 +3709,7 @@ var BROWSER_TOOLS = [
                   "screenshotFullPage",
                   "screenshotViewport"
                 ],
-                description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
+                description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
               },
               ref: {
                 type: "string",
@@ -3755,6 +3755,14 @@ var BROWSER_TOOLS = [
                 type: "array",
                 items: { type: "string" },
                 description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
+              },
+              scrollToSelector: {
+                type: "string",
+                description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
+              },
+              scrollY: {
+                type: "number",
+                description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
               }
             },
             required: ["command"]
@@ -3777,20 +3785,6 @@ var BROWSER_TOOLS = [
         }
       }
     }
-  },
-  {
-    clearable: true,
-    name: "screenshotViewport",
-    description: "Capture a screenshot of just the visible viewport (no full-page scroll/stitch). Returns a CDN URL with full text analysis and description. Use this when the goal is a specific section the page is currently scrolled to, rather than the whole page.",
-    inputSchema: {
-      type: "object",
-      properties: {
-        path: {
-          type: "string",
-          description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
-        }
-      }
-    }
   }
 ];
 var BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand"]);
@@ -3913,6 +3907,7 @@ var log7 = createLogger("browser-automation");
 async function runBrowserAutomation(task, context, opts) {
   const release = await acquireBrowserLock();
   try {
+    let lastBrowserCommandViewport;
     const result = await runSubAgent({
       system: getBrowserAutomationPrompt(),
       task,
@@ -3934,11 +3929,11 @@ async function runBrowserAutomation(task, context, opts) {
             return `Error setting up browser: ${err.message}`;
           }
         }
-        if (name === "screenshotFullPage" || name === "screenshotViewport") {
+        if (name === "screenshotFullPage") {
           try {
             return await captureAndAnalyzeScreenshot({
               path: _input.path,
-              fullPage: name === "screenshotFullPage",
+              fullPage: true,
               onLog,
               model: resolveModel(
                 "imageAnalysis",
@@ -3971,6 +3966,11 @@ async function runBrowserAutomation(task, context, opts) {
               (s) => s.command === "screenshotViewport" && s.result?.url
             );
             if (screenshotSteps.length > 0) {
+              const lastStep = screenshotSteps[screenshotSteps.length - 1];
+              lastBrowserCommandViewport = {
+                url: lastStep.result.url,
+                styleMap: lastStep.result.styleMap
+              };
               const visionOverride = {
                 model: resolveModel(
                   "imageAnalysis",
@@ -4014,11 +4014,11 @@ async function runBrowserAutomation(task, context, opts) {
         return result2;
       },
       toolRegistry: context.toolRegistry,
-      captureArtifacts: ["screenshotFullPage", "screenshotViewport"]
+      captureArtifacts: ["screenshotFullPage"]
     });
     context.subAgentMessages?.set(context.toolCallId, result.messages);
-    const viewport = result.artifacts?.screenshotViewport;
     const fullPage = result.artifacts?.screenshotFullPage;
+    const viewport = lastBrowserCommandViewport;
     const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
     return {
       text: result.text,

package/dist/index.js CHANGED Viewed

@@ -4167,7 +4167,7 @@ var init_tools = __esm({
                       "screenshotFullPage",
                       "screenshotViewport"
                     ],
-                    description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
+                    description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
                   },
                   ref: {
                     type: "string",
@@ -4213,6 +4213,14 @@ var init_tools = __esm({
                     type: "array",
                     items: { type: "string" },
                     description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
+                  },
+                  scrollToSelector: {
+                    type: "string",
+                    description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
+                  },
+                  scrollY: {
+                    type: "number",
+                    description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
                   }
                 },
                 required: ["command"]
@@ -4235,20 +4243,6 @@ var init_tools = __esm({
             }
           }
         }
-      },
-      {
-        clearable: true,
-        name: "screenshotViewport",
-        description: "Capture a screenshot of just the visible viewport (no full-page scroll/stitch). Returns a CDN URL with full text analysis and description. Use this when the goal is a specific section the page is currently scrolled to, rather than the whole page.",
-        inputSchema: {
-          type: "object",
-          properties: {
-            path: {
-              type: "string",
-              description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
-            }
-          }
-        }
       }
     ];
     BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand"]);
@@ -4284,6 +4278,7 @@ var init_prompt2 = __esm({
 async function runBrowserAutomation(task, context, opts) {
   const release = await acquireBrowserLock();
   try {
+    let lastBrowserCommandViewport;
     const result = await runSubAgent({
       system: getBrowserAutomationPrompt(),
       task,
@@ -4305,11 +4300,11 @@ async function runBrowserAutomation(task, context, opts) {
             return `Error setting up browser: ${err.message}`;
           }
         }
-        if (name === "screenshotFullPage" || name === "screenshotViewport") {
+        if (name === "screenshotFullPage") {
           try {
             return await captureAndAnalyzeScreenshot({
               path: _input.path,
-              fullPage: name === "screenshotFullPage",
+              fullPage: true,
               onLog,
               model: resolveModel(
                 "imageAnalysis",
@@ -4342,6 +4337,11 @@ async function runBrowserAutomation(task, context, opts) {
               (s) => s.command === "screenshotViewport" && s.result?.url
             );
             if (screenshotSteps.length > 0) {
+              const lastStep = screenshotSteps[screenshotSteps.length - 1];
+              lastBrowserCommandViewport = {
+                url: lastStep.result.url,
+                styleMap: lastStep.result.styleMap
+              };
               const visionOverride = {
                 model: resolveModel(
                   "imageAnalysis",
@@ -4385,11 +4385,11 @@ async function runBrowserAutomation(task, context, opts) {
         return result2;
       },
       toolRegistry: context.toolRegistry,
-      captureArtifacts: ["screenshotFullPage", "screenshotViewport"]
+      captureArtifacts: ["screenshotFullPage"]
     });
     context.subAgentMessages?.set(context.toolCallId, result.messages);
-    const viewport = result.artifacts?.screenshotViewport;
     const fullPage = result.artifacts?.screenshotFullPage;
+    const viewport = lastBrowserCommandViewport;
     const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
     return {
       text: result.text,

package/dist/subagents/browserAutomation/prompt.md CHANGED Viewed

@@ -43,7 +43,7 @@ Note: the snapshot concatenates inline text and strips whitespace. If you need t
 - `navigate`: Navigate to a new URL within the app. Waits for the new page to load before continuing with subsequent steps. Use this instead of evaluate with `window.location.href` when you need to navigate and then continue interacting with the new page. Steps after navigate execute on the new page automatically.
 - `evaluate`: Run arbitrary JavaScript in the page and return the result.
 - `styles`: Read computed CSS styles from page elements. Pass a `properties` array with camelCase CSS property names (e.g., `["backgroundColor", "borderRadius", "fontSize"]`). Omit `properties` for a default set covering colors, typography, spacing, borders, shadows, dimensions, and layout. Uses the same targeting as click/type (ref, text, role, label, selector). Omit the target to get styles for all elements from the last snapshot.
-- `screenshotViewport`: Take a screenshot of the current viewport. Returns CDN url with full text analysis and dimensions. Useful at the end of an action batch to visually see things like layout shift or overflow. Do not use if you can get what you need with other tools - only use when you need to visually see the viewport.
+- `screenshotViewport`: Take a screenshot of the visible viewport. Returns CDN url with full text analysis and dimensions. To capture a specific section, set `scrollToSelector` (a CSS selector) — or `scrollY` (an absolute offset) — on this same step; it scrolls the target into view and captures it atomically, so you do NOT need a separate scroll step. Do not use if you can get what you need with other tools - only use when you need to visually see the viewport.
 ### Element targeting (tried in order)
@@ -109,6 +109,15 @@ Select a dropdown option and screenshot the result:
 }
 ```
+Capture a specific below-the-fold section (scroll + capture in one atomic step):
+```json
+{
+  "steps": [
+    { "command": "screenshotViewport", "scrollToSelector": "#pricing" }
+  ]
+}
+```
 Navigate to a sub-page and interact with it:
 ```json
 {
@@ -140,7 +149,9 @@ Check a count with evaluate:
 </examples>
 ### Final Screenshot
-You can use the `screenshotFullPage` tool to take a full-height screenshot of the current page, or the `screenshotViewport` tool to capture just the visible viewport (faster, and the right choice when the task is about a specific section you've scrolled to). Both return the screenshot URL plus a full-text description. If the task asked for a viewport/section view, end with `screenshotViewport`; if it asked for the whole page, end with `screenshotFullPage`.
+How you take the final screenshot depends on what the task asked for:
+- **Whole page** → use the standalone `screenshotFullPage` tool. It takes a full-height screenshot of the current page and returns the URL plus a full-text description.
+- **A specific section / viewport** → use a `browserCommand` batch ending in a `screenshotViewport` step with `scrollToSelector` set to the section (e.g. `{ "command": "screenshotViewport", "scrollToSelector": "#pricing" }`). This scrolls the section into view and captures it in one atomic step. Do this rather than a separate scroll step followed by a capture — capturing the viewport is only reliable when the scroll and the shot are in the same step.
 <rules>
   - Always batch steps into a single browserCommand call. Don't send one step per turn. Type + click + wait should be one call, not three separate turns.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mindstudio-ai/remy",
-  "version": "0.1.193",
+  "version": "0.1.194",
   "description": "MindStudio coding agent",
   "repository": {
     "type": "git",