@mindstudio-ai/remy 0.1.64 → 0.1.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/headless.js CHANGED
@@ -574,7 +574,7 @@ async function* streamChat(params) {
574
574
  var MAX_RETRIES = 3;
575
575
  var INITIAL_BACKOFF_MS = 1e3;
576
576
  function isRetryableError(error) {
577
- return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error);
577
+ return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error) || /overloaded/i.test(error);
578
578
  }
579
579
  function sleep(ms) {
580
580
  return new Promise((resolve) => setTimeout(resolve, ms));
@@ -1892,7 +1892,7 @@ function formatOccurrenceError(count, lines, filePath) {
1892
1892
  var editFileTool = {
1893
1893
  definition: {
1894
1894
  name: "editFile",
1895
- description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match.",
1895
+ description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match. When editing nested structures (objects, function bodies, arrays, template literals), always include the full enclosing structure in old_string rather than just an inner fragment. Replacing a partial slice from the middle of nested code is the most common source of syntax errors.",
1896
1896
  inputSchema: {
1897
1897
  type: "object",
1898
1898
  properties: {
@@ -2501,6 +2501,37 @@ function findLastSummaryCheckpoint(messages, name) {
2501
2501
  }
2502
2502
  return -1;
2503
2503
  }
2504
/**
 * Return a copy of `messages` in which every assistant tool call has a
 * matching tool-result message.
 *
 * A tool call is an assistant content block with `type === "tool"`; its
 * result is a user message whose `toolCallId` equals the block's `id`.
 * For each tool call with no such result anywhere in `messages`, a synthetic
 * error result is inserted directly after the assistant message that made
 * the call.
 *
 * Every assistant message is repaired, not only the most recent one, so
 * orphans left by an earlier interruption are also covered.
 * NOTE(review): presumably the downstream API rejects tool calls that lack
 * results - confirm against the provider's message format.
 *
 * @param {Array<object>} messages - Conversation history; not mutated.
 * @returns {Array<object>} New array with synthetic results spliced in.
 */
function fixOrphanedToolCalls(messages) {
  // Collect the ids of all tool calls that already have a result.
  const toolResultIds = /* @__PURE__ */ new Set();
  for (const msg of messages) {
    if (msg.role === "user" && msg.toolCallId) {
      toolResultIds.add(msg.toolCallId);
    }
  }

  const result = [...messages];
  // Walk backwards so that splicing after index i never shifts the
  // positions of the (earlier) messages still to be visited.
  for (let i = result.length - 1; i >= 0; i--) {
    const msg = result[i];
    if (msg.role !== "assistant" || !Array.isArray(msg.content)) {
      continue;
    }
    const orphans = msg.content.filter(
      (block) => block.type === "tool" && !toolResultIds.has(block.id)
    );
    if (orphans.length === 0) {
      continue;
    }
    const synthetics = orphans.map((tc) => ({
      role: "user",
      content: "Error: tool result lost (session recovered)",
      toolCallId: tc.id,
      isToolError: true
    }));
    result.splice(i + 1, 0, ...synthetics);
  }
  return result;
}
2504
2535
  function cleanMessagesForApi(messages) {
2505
2536
  const checkpointIdx = findLastSummaryCheckpoint(messages, "conversation");
2506
2537
  let startIdx = 0;
@@ -2522,7 +2553,7 @@ ${summaryBlock.text}
2522
2553
  }
2523
2554
  startIdx = checkpointIdx + 1;
2524
2555
  }
2525
- const messagesToProcess = messages.slice(startIdx);
2556
+ const messagesToProcess = fixOrphanedToolCalls(messages.slice(startIdx));
2526
2557
  const toolUseIds = /* @__PURE__ */ new Set();
2527
2558
  for (const msg of messagesToProcess) {
2528
2559
  if (msg.role === "assistant" && Array.isArray(msg.content)) {
@@ -3345,11 +3376,37 @@ var definition5 = {
3345
3376
  path: {
3346
3377
  type: "string",
3347
3378
  description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
3379
+ },
3380
+ instructions: {
3381
+ type: "string",
3382
+ description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot."
3348
3383
  }
3349
3384
  }
3350
3385
  }
3351
3386
  };
3352
- async function execute5(input, onLog) {
3387
+ async function execute5(input, onLog, context) {
3388
+ if (input.instructions && context) {
3389
+ try {
3390
+ const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
3391
+ const result = await browserAutomationTool.execute({ task }, context);
3392
+ const urlMatch = result.match(
3393
+ /https:\/\/[^\s"')]+\.(?:png|jpg|jpeg|webp)/i
3394
+ );
3395
+ if (!urlMatch) {
3396
+ return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${result}`;
3397
+ }
3398
+ const url = urlMatch[0];
3399
+ const analysisPrompt = input.prompt || SCREENSHOT_ANALYSIS_PROMPT;
3400
+ const analysis = await analyzeImage({
3401
+ prompt: analysisPrompt,
3402
+ imageUrl: url,
3403
+ onLog
3404
+ });
3405
+ return JSON.stringify({ url, analysis });
3406
+ } catch (err) {
3407
+ return `Error taking interactive screenshot: ${err.message}`;
3408
+ }
3409
+ }
3353
3410
  try {
3354
3411
  return await captureAndAnalyzeScreenshot({
3355
3412
  prompt: input.prompt,
@@ -3618,7 +3675,7 @@ async function executeDesignExpertTool(name, input, context, toolCallId, onLog)
3618
3675
  if (!tool) {
3619
3676
  return `Error: unknown tool "${name}"`;
3620
3677
  }
3621
- return tool.execute(input, onLog);
3678
+ return tool.execute(input, onLog, context);
3622
3679
  }
3623
3680
 
3624
3681
  // src/subagents/designExpert/prompt.ts
package/dist/index.js CHANGED
@@ -221,7 +221,7 @@ async function* streamChat(params) {
221
221
  }
222
222
  }
223
223
  function isRetryableError(error) {
224
- return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error);
224
+ return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error) || /overloaded/i.test(error);
225
225
  }
226
226
  function sleep(ms) {
227
227
  return new Promise((resolve) => setTimeout(resolve, ms));
@@ -1477,7 +1477,7 @@ var init_editFile = __esm({
1477
1477
  editFileTool = {
1478
1478
  definition: {
1479
1479
  name: "editFile",
1480
- description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match.",
1480
+ description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match. When editing nested structures (objects, function bodies, arrays, template literals), always include the full enclosing structure in old_string rather than just an inner fragment. Replacing a partial slice from the middle of nested code is the most common source of syntax errors.",
1481
1481
  inputSchema: {
1482
1482
  type: "object",
1483
1483
  properties: {
@@ -2228,6 +2228,37 @@ function findLastSummaryCheckpoint(messages, name) {
2228
2228
  }
2229
2229
  return -1;
2230
2230
  }
2231
/**
 * Return a copy of `messages` in which every assistant tool call has a
 * matching tool-result message.
 *
 * A tool call is an assistant content block with `type === "tool"`; its
 * result is a user message whose `toolCallId` equals the block's `id`.
 * For each tool call with no such result anywhere in `messages`, a synthetic
 * error result is inserted directly after the assistant message that made
 * the call.
 *
 * Every assistant message is repaired, not only the most recent one, so
 * orphans left by an earlier interruption are also covered.
 * NOTE(review): presumably the downstream API rejects tool calls that lack
 * results - confirm against the provider's message format.
 *
 * @param {Array<object>} messages - Conversation history; not mutated.
 * @returns {Array<object>} New array with synthetic results spliced in.
 */
function fixOrphanedToolCalls(messages) {
  // Collect the ids of all tool calls that already have a result.
  const toolResultIds = /* @__PURE__ */ new Set();
  for (const msg of messages) {
    if (msg.role === "user" && msg.toolCallId) {
      toolResultIds.add(msg.toolCallId);
    }
  }

  const result = [...messages];
  // Walk backwards so that splicing after index i never shifts the
  // positions of the (earlier) messages still to be visited.
  for (let i = result.length - 1; i >= 0; i--) {
    const msg = result[i];
    if (msg.role !== "assistant" || !Array.isArray(msg.content)) {
      continue;
    }
    const orphans = msg.content.filter(
      (block) => block.type === "tool" && !toolResultIds.has(block.id)
    );
    if (orphans.length === 0) {
      continue;
    }
    const synthetics = orphans.map((tc) => ({
      role: "user",
      content: "Error: tool result lost (session recovered)",
      toolCallId: tc.id,
      isToolError: true
    }));
    result.splice(i + 1, 0, ...synthetics);
  }
  return result;
}
2231
2262
  function cleanMessagesForApi(messages) {
2232
2263
  const checkpointIdx = findLastSummaryCheckpoint(messages, "conversation");
2233
2264
  let startIdx = 0;
@@ -2249,7 +2280,7 @@ ${summaryBlock.text}
2249
2280
  }
2250
2281
  startIdx = checkpointIdx + 1;
2251
2282
  }
2252
- const messagesToProcess = messages.slice(startIdx);
2283
+ const messagesToProcess = fixOrphanedToolCalls(messages.slice(startIdx));
2253
2284
  const toolUseIds = /* @__PURE__ */ new Set();
2254
2285
  for (const msg of messagesToProcess) {
2255
2286
  if (msg.role === "assistant" && Array.isArray(msg.content)) {
@@ -3172,7 +3203,29 @@ __export(screenshot_exports, {
3172
3203
  definition: () => definition5,
3173
3204
  execute: () => execute5
3174
3205
  });
3175
- async function execute5(input, onLog) {
3206
+ async function execute5(input, onLog, context) {
3207
+ if (input.instructions && context) {
3208
+ try {
3209
+ const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
3210
+ const result = await browserAutomationTool.execute({ task }, context);
3211
+ const urlMatch = result.match(
3212
+ /https:\/\/[^\s"')]+\.(?:png|jpg|jpeg|webp)/i
3213
+ );
3214
+ if (!urlMatch) {
3215
+ return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${result}`;
3216
+ }
3217
+ const url = urlMatch[0];
3218
+ const analysisPrompt = input.prompt || SCREENSHOT_ANALYSIS_PROMPT;
3219
+ const analysis = await analyzeImage({
3220
+ prompt: analysisPrompt,
3221
+ imageUrl: url,
3222
+ onLog
3223
+ });
3224
+ return JSON.stringify({ url, analysis });
3225
+ } catch (err) {
3226
+ return `Error taking interactive screenshot: ${err.message}`;
3227
+ }
3228
+ }
3176
3229
  try {
3177
3230
  return await captureAndAnalyzeScreenshot({
3178
3231
  prompt: input.prompt,
@@ -3188,6 +3241,8 @@ var init_screenshot3 = __esm({
3188
3241
  "src/subagents/designExpert/tools/screenshot.ts"() {
3189
3242
  "use strict";
3190
3243
  init_screenshot();
3244
+ init_analyzeImage();
3245
+ init_browserAutomation();
3191
3246
  definition5 = {
3192
3247
  name: "screenshot",
3193
3248
  description: "Capture a full-height screenshot of the current app preview. Returns a CDN URL along with visual analysis. Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
@@ -3201,6 +3256,10 @@ var init_screenshot3 = __esm({
3201
3256
  path: {
3202
3257
  type: "string",
3203
3258
  description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
3259
+ },
3260
+ instructions: {
3261
+ type: "string",
3262
+ description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot."
3204
3263
  }
3205
3264
  }
3206
3265
  }
@@ -3482,7 +3541,7 @@ async function executeDesignExpertTool(name, input, context, toolCallId, onLog)
3482
3541
  if (!tool) {
3483
3542
  return `Error: unknown tool "${name}"`;
3484
3543
  }
3485
- return tool.execute(input, onLog);
3544
+ return tool.execute(input, onLog, context);
3486
3545
  }
3487
3546
  var tools, DESIGN_EXPERT_TOOLS;
3488
3547
  var init_tools2 = __esm({
@@ -38,7 +38,7 @@ Always consult the code sanity check before writing code in initialCodegen with
38
38
 
39
39
  ### QA (`runAutomatedBrowserTest`)
40
40
 
41
- For verifying complex stateful interactions: multi-step form submissions, auth flows, real-time updates, flows that require specific data/role setup. This spins up a full chrome browser automation — it's heavyweight. Do not use it for basic rendering or navigation checks. If you can verify something with a screenshot or by reading the code, do that instead. Run a scenario first to seed test data and set user roles. The user is able to watch QA work on their screen via a live browser preview - the cursor will move, type, etc - so you can also use this to demo functionality to the user and help them understand how to use their app.
41
+ For verifying complex stateful interactions: multi-step form submissions, auth flows, real-time updates, flows that require specific data/role setup. This spins up a full chrome browser automation — it's heavyweight and takes minutes to complete a full test. Do not use it for basic rendering or navigation checks. If you can verify something with a screenshot or by reading the code, do that instead. Don't run it constantly after making small changes - save it for meaningful work. Run a scenario first to seed test data and set user roles. The user is able to watch QA work on their screen via a live browser preview - the cursor will move, type, etc - so you can also use this to demo functionality to the user and help them understand how to use their app.
42
42
 
43
43
  The QA agent can see the screen. Describe what to test, not how — it will figure out what to click, what to check, and what values to use.
44
44
 
@@ -1,5 +1,8 @@
1
1
  You are a browser smoke test agent. You verify that features work end to end by interacting with the live preview. Focus on outcomes: does the feature work? Did the expected content appear? Just do the thing and see if it worked.
2
2
 
3
+ ## Rules to Remember
4
+ - Don't overthink the tests - the goal is to generally make sure things work as expected, not to provide detailed QA. If something seems mostly okay, note it and move on. Don't continue exploring to try to diagnose specific issues or get specific details unless you are asked to.
5
+
3
6
  ## Tester Persona
4
7
  The user is watching the automation happen on their screen in real-time. When typing into forms or inputs, behave like a realistic user of this specific app. Use the app context (if provided) to understand the audience and tone. Type the way that audience would actually type — not formal, not robotic. The app developer's name is Remy, so use that and the email remy@mindstudio.ai as the basis for any testing that requires a persona.
5
8
 
@@ -10,6 +10,7 @@ Then, think about the layout and UI patterns - these are the core of the user's
10
10
 
11
11
  ## Tool Usage
12
12
  - When multiple tool calls are independent, make them all in a single turn. Searching for three different products, or fetching two reference sites: batch them instead of doing one per turn.
13
+ - The screenshot tool supports an `instructions` parameter for taking screenshots that require interaction first. If you need to screenshot a state that's behind a modal, a specific tab, or a multi-step flow, pass `instructions` describing how to get there (e.g., "dismiss the welcome modal, then click XYZ"). A browser automation agent will follow your instructions and capture the screenshot for you.
13
14
 
14
15
  ## Voice
15
16
  - No emoji, no filler.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mindstudio-ai/remy",
3
- "version": "0.1.64",
3
+ "version": "0.1.66",
4
4
  "description": "MindStudio coding agent",
5
5
  "repository": {
6
6
  "type": "git",