npm - @mindstudio-ai/remy - Versions diffs - 0.1.108 → 0.1.110 - Mend

@mindstudio-ai/remy 0.1.108 → 0.1.110

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/dist/headless.js +50 -16
package/dist/index.js +53 -18
package/dist/prompt/compiled/interfaces.md +2 -2
package/dist/prompt/compiled/sdk-actions.md +1 -0
package/dist/prompt/sources/llms.txt +1 -0
package/dist/prompt/static/instructions.md +1 -1
package/dist/prompt/static/intake.md +1 -1
package/dist/subagents/designExpert/prompts/instructions.md +1 -1
package/package.json +1 -1

package/dist/headless.js CHANGED Viewed

@@ -414,7 +414,7 @@ ${isLspConfigured() ? `<typescript_lsp>
 <conversation_summaries>
 Your conversation history may include <prior_conversation_summary> blocks in the user's messages. These are automated summaries of earlier messages that have been compacted to save context space. The user does not see this summary, they see the full conversation history in their UI. Treat the summary as ground truth for what happened before, but do not reference it directly to the user ("as mentioned in the summary..."). Just continue naturally as if you remember the prior work.
-Old tool results are periodically cleared from the conversation to save context space. This is automatic and expected \u2014 you don't need to note down or preserve information from tool results. If you need to reference something from an earlier tool call, just re-read the file or re-run the query, or use your .remy-notes.md file.
+Old tool results are periodically cleared from the conversation to save context space. This is automatic and expected \u2014 you don't need to note down or preserve information from tool results. If you need to reference something from an earlier tool call, just re-read the file or re-run the query.
 </conversation_summaries>
 <project_onboarding>
@@ -2454,9 +2454,24 @@ async function analyzeImage(params) {
 // src/tools/_helpers/screenshot.ts
 var SCREENSHOT_ANALYSIS_PROMPT = `Describe everything visible on screen from top to bottom \u2014 every element, its position, its size relative to the viewport, its colors, its content. Be comprehensive, thorough, and spatial. After the inventory, note anything that looks visually broken (overlapping elements, clipped text, misaligned components).
-Note: ignore text wrapping issues. Screenshots occasionally show text wrapping onto an extra line compared to the live page \u2014 most noticeable in buttons, badges, and headings. This is a known limitation of SVG foreignObject rendering used the DOM-to-image capture library that took the screenshot. The browser's SVG renderer computes slightly wider text metrics than the HTML layout engine, so text that fits on one line in the live DOM can overflow by a fraction of a pixel in the capture - this is not a real issue.
 Respond only with your analysis as Markdown and absolutely no other text. Do not use emojis - use unicode if you need symbols.`;
+var TEXT_WRAP_DISCLAIMER = `Note: ignore text wrapping issues. Screenshots occasionally show text wrapping onto an extra line compared to the live page \u2014 most noticeable in buttons, badges, and headings. This is a known limitation of SVG foreignObject rendering used the DOM-to-image capture library that took the screenshot. The browser's SVG renderer computes slightly wider text metrics than the HTML layout engine, so text that fits on one line in the live DOM can overflow by a fraction of a pixel in the capture - this is not a real issue.`;
+function buildScreenshotAnalysisPrompt(opts) {
+  let p = opts?.prompt || SCREENSHOT_ANALYSIS_PROMPT;
+  if (opts?.styleMap) {
+    p += `
+The following styleMap describes the computed layout state at the moment of capture. Use it to verify typography, spacing, overflow, and element dimensions \u2014 it is more accurate than visual estimation from the image.
+<style_map>
+${opts.styleMap}
+</style_map>`;
+  }
+  p += `
+${TEXT_WRAP_DISCLAIMER}`;
+  return p;
+}
 async function captureAndAnalyzeScreenshot(promptOrOptions) {
   let prompt;
   let existingUrl;
@@ -2471,6 +2486,7 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
     prompt = promptOrOptions;
   }
   let url;
+  let styleMap;
   if (existingUrl) {
     url = existingUrl;
   } else {
@@ -2485,17 +2501,21 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
         `No URL in sidecar response. The browser may not be ready yet. Response: ${JSON.stringify(ssResult)}`
       );
     }
+    styleMap = ssResult?.styleMap;
   }
   if (prompt === false) {
     return url;
   }
-  const analysisPrompt = prompt || SCREENSHOT_ANALYSIS_PROMPT;
+  const analysisPrompt = buildScreenshotAnalysisPrompt({
+    prompt: prompt || void 0,
+    styleMap
+  });
   const analysis = await analyzeImage({
     prompt: analysisPrompt,
     imageUrl: url,
     onLog
   });
-  return JSON.stringify({ url, analysis });
+  return JSON.stringify({ url, analysis, ...styleMap ? { styleMap } : {} });
 }
 // src/tools/code/screenshot.ts
@@ -3280,7 +3300,9 @@ var browserAutomationTool = {
                   stepType: "analyzeImage",
                   step: {
                     imageUrl: s.result.url,
-                    prompt: SCREENSHOT_ANALYSIS_PROMPT
+                    prompt: buildScreenshotAnalysisPrompt({
+                      styleMap: s.result.styleMap
+                    })
                   }
                 }));
                 const batchResult = await runCli(
@@ -3459,12 +3481,6 @@ __export(analyzeImage_exports, {
   definition: () => definition4,
   execute: () => execute4
 });
-var DEFAULT_PROMPT = `
-Describe everything visible in this image \u2014 every element, its position, its size relative to the frame, its colors, its content. Be comprehensive, thorough and spatial. After the inventory, note anything that looks visually broken (overlapping elements, clipped text, misaligned components).
-Note: ignore text wrapping issues. Screenshots occasionally show text wrapping onto an extra line compared to the live page \u2014 most noticeable in buttons, badges, and headings. This is a known limitation of SVG foreignObject rendering used the DOM-to-image capture library that took the screenshot. The browser's SVG renderer computes slightly wider text metrics than the HTML layout engine, so text that fits on one line in the live DOM can overflow by a fraction of a pixel in the capture - this is not a real issue.
-Respond only with your analysis as Markdown and absolutely no other text. Do not use emojis - use unicode if you need symbols.`;
 var definition4 = {
   clearable: true,
   name: "analyzeImage",
@@ -3486,7 +3502,9 @@ var definition4 = {
 };
 async function execute4(input, onLog) {
   const imageUrl = input.imageUrl;
-  const prompt = input.prompt || DEFAULT_PROMPT;
+  const prompt = buildScreenshotAnalysisPrompt({
+    prompt: input.prompt
+  });
   const analysis = await analyzeImage({
     prompt,
     imageUrl,
@@ -3535,13 +3553,26 @@ async function execute5(input, onLog, context) {
         return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${result}`;
       }
       const url = urlMatch[0];
-      const analysisPrompt = input.prompt || SCREENSHOT_ANALYSIS_PROMPT;
+      let styleMap;
+      try {
+        const parsed = JSON.parse(result);
+        styleMap = parsed?.styleMap;
+      } catch {
+      }
+      const analysisPrompt = buildScreenshotAnalysisPrompt({
+        prompt: input.prompt,
+        styleMap
+      });
       const analysis = await analyzeImage({
         prompt: analysisPrompt,
         imageUrl: url,
         onLog
       });
-      return JSON.stringify({ url, analysis });
+      return JSON.stringify({
+        url,
+        analysis,
+        ...styleMap ? { styleMap } : {}
+      });
     } catch (err) {
       return `Error taking interactive screenshot: ${err.message}`;
     }
@@ -4576,7 +4607,7 @@ var SANITY_CHECK_TOOLS = [
   },
   {
     name: "askMindStudioSdk",
-    description: "Check if the MindStudio SDK has a managed action for something before writing custom code.",
+    description: "Check if the MindStudio SDK has a managed action for something before writing custom code. Use bullet points to ask many questions at once.",
     inputSchema: {
       type: "object",
       properties: {
@@ -6041,6 +6072,9 @@ ${xmlParts}
       if (pending) {
         pendingTools.delete(id);
         pending.resolve(result);
+      } else if (!running) {
+        log10.info("Late tool_result while idle, dismissing", { id });
+        emit("completed", { success: true }, requestId);
       } else {
         earlyResults.set(id, result);
       }

package/dist/index.js CHANGED Viewed

@@ -2154,6 +2154,22 @@ var init_analyzeImage = __esm({
 });
 // src/tools/_helpers/screenshot.ts
+function buildScreenshotAnalysisPrompt(opts) {
+  let p = opts?.prompt || SCREENSHOT_ANALYSIS_PROMPT;
+  if (opts?.styleMap) {
+    p += `
+The following styleMap describes the computed layout state at the moment of capture. Use it to verify typography, spacing, overflow, and element dimensions \u2014 it is more accurate than visual estimation from the image.
+<style_map>
+${opts.styleMap}
+</style_map>`;
+  }
+  p += `
+${TEXT_WRAP_DISCLAIMER}`;
+  return p;
+}
 async function captureAndAnalyzeScreenshot(promptOrOptions) {
   let prompt;
   let existingUrl;
@@ -2168,6 +2184,7 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
     prompt = promptOrOptions;
   }
   let url;
+  let styleMap;
   if (existingUrl) {
     url = existingUrl;
   } else {
@@ -2182,19 +2199,23 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
         `No URL in sidecar response. The browser may not be ready yet. Response: ${JSON.stringify(ssResult)}`
       );
     }
+    styleMap = ssResult?.styleMap;
   }
   if (prompt === false) {
     return url;
   }
-  const analysisPrompt = prompt || SCREENSHOT_ANALYSIS_PROMPT;
+  const analysisPrompt = buildScreenshotAnalysisPrompt({
+    prompt: prompt || void 0,
+    styleMap
+  });
   const analysis = await analyzeImage({
     prompt: analysisPrompt,
     imageUrl: url,
     onLog
   });
-  return JSON.stringify({ url, analysis });
+  return JSON.stringify({ url, analysis, ...styleMap ? { styleMap } : {} });
 }
-var SCREENSHOT_ANALYSIS_PROMPT;
+var SCREENSHOT_ANALYSIS_PROMPT, TEXT_WRAP_DISCLAIMER;
 var init_screenshot = __esm({
   "src/tools/_helpers/screenshot.ts"() {
     "use strict";
@@ -2202,9 +2223,8 @@ var init_screenshot = __esm({
     init_analyzeImage();
     SCREENSHOT_ANALYSIS_PROMPT = `Describe everything visible on screen from top to bottom \u2014 every element, its position, its size relative to the viewport, its colors, its content. Be comprehensive, thorough, and spatial. After the inventory, note anything that looks visually broken (overlapping elements, clipped text, misaligned components).
-Note: ignore text wrapping issues. Screenshots occasionally show text wrapping onto an extra line compared to the live page \u2014 most noticeable in buttons, badges, and headings. This is a known limitation of SVG foreignObject rendering used the DOM-to-image capture library that took the screenshot. The browser's SVG renderer computes slightly wider text metrics than the HTML layout engine, so text that fits on one line in the live DOM can overflow by a fraction of a pixel in the capture - this is not a real issue.
 Respond only with your analysis as Markdown and absolutely no other text. Do not use emojis - use unicode if you need symbols.`;
+    TEXT_WRAP_DISCLAIMER = `Note: ignore text wrapping issues. Screenshots occasionally show text wrapping onto an extra line compared to the live page \u2014 most noticeable in buttons, badges, and headings. This is a known limitation of SVG foreignObject rendering used the DOM-to-image capture library that took the screenshot. The browser's SVG renderer computes slightly wider text metrics than the HTML layout engine, so text that fits on one line in the live DOM can overflow by a fraction of a pixel in the capture - this is not a real issue.`;
   }
 });
@@ -3084,7 +3104,9 @@ var init_browserAutomation = __esm({
                       stepType: "analyzeImage",
                       step: {
                         imageUrl: s.result.url,
-                        prompt: SCREENSHOT_ANALYSIS_PROMPT
+                        prompt: buildScreenshotAnalysisPrompt({
+                          styleMap: s.result.styleMap
+                        })
                       }
                     }));
                     const batchResult = await runCli(
@@ -3289,7 +3311,9 @@ __export(analyzeImage_exports, {
 });
 async function execute4(input, onLog) {
   const imageUrl = input.imageUrl;
-  const prompt = input.prompt || DEFAULT_PROMPT;
+  const prompt = buildScreenshotAnalysisPrompt({
+    prompt: input.prompt
+  });
   const analysis = await analyzeImage({
     prompt,
     imageUrl,
@@ -3297,17 +3321,12 @@ async function execute4(input, onLog) {
   });
   return JSON.stringify({ url: imageUrl, analysis });
 }
-var DEFAULT_PROMPT, definition4;
+var definition4;
 var init_analyzeImage2 = __esm({
   "src/subagents/designExpert/tools/analyzeImage.ts"() {
     "use strict";
     init_analyzeImage();
-    DEFAULT_PROMPT = `
-Describe everything visible in this image \u2014 every element, its position, its size relative to the frame, its colors, its content. Be comprehensive, thorough and spatial. After the inventory, note anything that looks visually broken (overlapping elements, clipped text, misaligned components).
-Note: ignore text wrapping issues. Screenshots occasionally show text wrapping onto an extra line compared to the live page \u2014 most noticeable in buttons, badges, and headings. This is a known limitation of SVG foreignObject rendering used the DOM-to-image capture library that took the screenshot. The browser's SVG renderer computes slightly wider text metrics than the HTML layout engine, so text that fits on one line in the live DOM can overflow by a fraction of a pixel in the capture - this is not a real issue.
-Respond only with your analysis as Markdown and absolutely no other text. Do not use emojis - use unicode if you need symbols.`;
+    init_screenshot();
     definition4 = {
       clearable: true,
       name: "analyzeImage",
@@ -3348,13 +3367,26 @@ async function execute5(input, onLog, context) {
         return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${result}`;
       }
       const url = urlMatch[0];
-      const analysisPrompt = input.prompt || SCREENSHOT_ANALYSIS_PROMPT;
+      let styleMap;
+      try {
+        const parsed = JSON.parse(result);
+        styleMap = parsed?.styleMap;
+      } catch {
+      }
+      const analysisPrompt = buildScreenshotAnalysisPrompt({
+        prompt: input.prompt,
+        styleMap
+      });
       const analysis = await analyzeImage({
         prompt: analysisPrompt,
         imageUrl: url,
         onLog
       });
-      return JSON.stringify({ url, analysis });
+      return JSON.stringify({
+        url,
+        analysis,
+        ...styleMap ? { styleMap } : {}
+      });
     } catch (err) {
       return `Error taking interactive screenshot: ${err.message}`;
     }
@@ -4562,7 +4594,7 @@ var init_tools4 = __esm({
       },
       {
         name: "askMindStudioSdk",
-        description: "Check if the MindStudio SDK has a managed action for something before writing custom code.",
+        description: "Check if the MindStudio SDK has a managed action for something before writing custom code. Use bullet points to ask many questions at once.",
         inputSchema: {
           type: "object",
           properties: {
@@ -5825,7 +5857,7 @@ ${isLspConfigured() ? `<typescript_lsp>
 <conversation_summaries>
 Your conversation history may include <prior_conversation_summary> blocks in the user's messages. These are automated summaries of earlier messages that have been compacted to save context space. The user does not see this summary, they see the full conversation history in their UI. Treat the summary as ground truth for what happened before, but do not reference it directly to the user ("as mentioned in the summary..."). Just continue naturally as if you remember the prior work.
-Old tool results are periodically cleared from the conversation to save context space. This is automatic and expected \u2014 you don't need to note down or preserve information from tool results. If you need to reference something from an earlier tool call, just re-read the file or re-run the query, or use your .remy-notes.md file.
+Old tool results are periodically cleared from the conversation to save context space. This is automatic and expected \u2014 you don't need to note down or preserve information from tool results. If you need to reference something from an earlier tool call, just re-read the file or re-run the query.
 </conversation_summaries>
 <project_onboarding>
@@ -6685,6 +6717,9 @@ ${xmlParts}
       if (pending) {
         pendingTools.delete(id);
         pending.resolve(result);
+      } else if (!running) {
+        log10.info("Late tool_result while idle, dismissing", { id });
+        emit("completed", { success: true }, requestId);
       } else {
         earlyResults.set(id, result);
       }

package/dist/prompt/compiled/interfaces.md CHANGED Viewed

@@ -251,7 +251,7 @@ The human-readable spec. Frontmatter contains structured fields; the prose body
 ```yaml
 ---
 name: Todo Assistant
-model: {"model": "claude-4-5-haiku", "temperature": 0.5, "maxResponseTokens": 15000}
+model: {"model": "claude-4-5-haiku", "temperature": 0.5, "maxResponseTokens": 16000}
 description: Conversational agent that helps users manage their to-do list.
 ---
 ```
@@ -282,7 +282,7 @@ dist/interfaces/agent/
   "agent": {
     "model": "claude-4-5-haiku",
     "temperature": 0.5,
-    "maxTokens": 15000,
+    "maxTokens": 16000,
     "systemPrompt": "system.md",
     "tools": [
       { "method": "create-todo", "description": "tools/createTodo.md" },

package/dist/prompt/compiled/sdk-actions.md CHANGED Viewed

@@ -127,6 +127,7 @@ const { content } = await agent.generateText({
   modelOverride: {
     model: 'claude-sonnet-4-6',
     temperature: 0.7,
+    maxResponseTokens: 16000,
   },
 });
 ```

package/dist/prompt/sources/llms.txt CHANGED Viewed

@@ -136,6 +136,7 @@ const { content } = await agent.generateText({
   modelOverride: {
     model: model.id,
     temperature: 0.7,
+    maxResponseTokens: 16000,
   },
 });
 ```

package/dist/prompt/static/instructions.md CHANGED Viewed

@@ -17,7 +17,7 @@
 - Pushing to main branch will trigger a deploy. The user presses the publish button in the interface to request publishing.
 ### Build Notes
-For complex tasks — especially an initial buildout from a spec or making multiple changes in a single turn — write a `.remy-notes.md` scratchpad in the project root. Use it to track progress: a checklist of what's been built and what's remaining. Do not include implementation details in the notes. Read the spec files directly when you need reference data. Delete the notes file when your work is done.
+For complex tasks — especially an initial buildout from a spec or making multiple changes in a single turn — write a `.remy-notes.md` scratchpad in the project root. Use it to track progress: a checklist of what's been built and what's remaining. Do not include implementation details or other decisions in the notes - it is solely for keeping track of tasks. Read the spec files directly when you need design details, implementation decisions, or other reference materials. Delete the notes file when your work is done.
 ## Communication
 The user can already see your tool calls, so most of your work is visible without narration. Focus text output on three things:

package/dist/prompt/static/intake.md CHANGED Viewed

@@ -25,7 +25,7 @@ Don't recite this list to users. Use it to calibrate your sense of what's possib
 - **Automations** — cron jobs that monitor competitors and send alerts, webhook handlers that sync data between services, email processors that triage support requests — no UI needed
 - **Conversational AI agents** — custom chat UIs backed by any model, with tool access to the app's methods. Full control over what the agent can do and who can use it
 - **Bots & agent tools** — Discord slash-command bots, Telegram bots, MCP tool servers for AI assistants
-- **Creative projects** — browser games with p5.js or Three.js, interactive visualizations, generative art, portfolio sites with dynamic backends
+- **Creative projects** — browser games with p5.js or Three.js, interactive visualizations, 3D things, generative art, portfolio sites with dynamic backends
 - **Marketing & launch pages** — landing pages, waitlist pages with referral mechanics, product sites with scroll animations — visual polish is a strength here
 - **API services** — backend logic exposed as REST endpoints
 - **Simple static sites** — no backend needed, just a web interface with a build step

package/dist/subagents/designExpert/prompts/instructions.md CHANGED Viewed

@@ -12,7 +12,7 @@ Think about the ways you can truly elevate the design. Use image generation to c
 ## Tool Usage
 - When multiple tool calls are independent, make them all in a single turn. Searching for three different products, or fetching two reference sites: batch them instead of doing one per turn.
-- The screenshot tool supports an `instructions` parameter for taking screenshots that require interaction first. If you need to screenshot a state that's behind a modal, a specific tab, or a multi-step flow, pass `instructions` describing how to get there (e.g., "dismiss the welcome modal, then click XYZ"). A browser automation agent will follow your instructions and capture the screenshot for you.
+- The screenshot tool supports an `instructions` parameter for taking screenshots that require interaction first. If you need to screenshot a state that's behind a modal, a specific tab, or a multi-step flow, pass `instructions` describing how to get there (e.g., "dismiss the welcome modal, then click XYZ"). A browser automation agent will follow your instructions and capture the screenshot for you. You cannot use this to scroll - you will always receive a full page screenshot. Only use this if you need to trigger stateful changes within the app to get the full-page screenshot.
 - After you've taken a screenshot, use analyze image to ask different questions about it - don't re-screenshot the page unnecessarily.
 ## Voice

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mindstudio-ai/remy",
-  "version": "0.1.108",
+  "version": "0.1.110",
   "description": "MindStudio coding agent",
   "repository": {
     "type": "git",