@mindstudio-ai/remy 0.1.63 → 0.1.65
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/headless.js
CHANGED
|
@@ -574,7 +574,7 @@ async function* streamChat(params) {
|
|
|
574
574
|
var MAX_RETRIES = 3;
|
|
575
575
|
var INITIAL_BACKOFF_MS = 1e3;
|
|
576
576
|
function isRetryableError(error) {
|
|
577
|
-
return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error);
|
|
577
|
+
return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error) || /overloaded/i.test(error);
|
|
578
578
|
}
|
|
579
579
|
function sleep(ms) {
|
|
580
580
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
@@ -1892,7 +1892,7 @@ function formatOccurrenceError(count, lines, filePath) {
|
|
|
1892
1892
|
var editFileTool = {
|
|
1893
1893
|
definition: {
|
|
1894
1894
|
name: "editFile",
|
|
1895
|
-
description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match.",
|
|
1895
|
+
description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match. When editing nested structures (objects, function bodies, arrays, template literals), always include the full enclosing structure in old_string rather than just an inner fragment. Replacing a partial slice from the middle of nested code is the most common source of syntax errors.",
|
|
1896
1896
|
inputSchema: {
|
|
1897
1897
|
type: "object",
|
|
1898
1898
|
properties: {
|
|
@@ -3345,11 +3345,37 @@ var definition5 = {
|
|
|
3345
3345
|
path: {
|
|
3346
3346
|
type: "string",
|
|
3347
3347
|
description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
|
|
3348
|
+
},
|
|
3349
|
+
instructions: {
|
|
3350
|
+
type: "string",
|
|
3351
|
+
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot."
|
|
3348
3352
|
}
|
|
3349
3353
|
}
|
|
3350
3354
|
}
|
|
3351
3355
|
};
|
|
3352
|
-
async function execute5(input, onLog) {
|
|
3356
|
+
async function execute5(input, onLog, context) {
|
|
3357
|
+
if (input.instructions && context) {
|
|
3358
|
+
try {
|
|
3359
|
+
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
|
|
3360
|
+
const result = await browserAutomationTool.execute({ task }, context);
|
|
3361
|
+
const urlMatch = result.match(
|
|
3362
|
+
/https:\/\/[^\s"')]+\.(?:png|jpg|jpeg|webp)/i
|
|
3363
|
+
);
|
|
3364
|
+
if (!urlMatch) {
|
|
3365
|
+
return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${result}`;
|
|
3366
|
+
}
|
|
3367
|
+
const url = urlMatch[0];
|
|
3368
|
+
const analysisPrompt = input.prompt || SCREENSHOT_ANALYSIS_PROMPT;
|
|
3369
|
+
const analysis = await analyzeImage({
|
|
3370
|
+
prompt: analysisPrompt,
|
|
3371
|
+
imageUrl: url,
|
|
3372
|
+
onLog
|
|
3373
|
+
});
|
|
3374
|
+
return JSON.stringify({ url, analysis });
|
|
3375
|
+
} catch (err) {
|
|
3376
|
+
return `Error taking interactive screenshot: ${err.message}`;
|
|
3377
|
+
}
|
|
3378
|
+
}
|
|
3353
3379
|
try {
|
|
3354
3380
|
return await captureAndAnalyzeScreenshot({
|
|
3355
3381
|
prompt: input.prompt,
|
|
@@ -3618,7 +3644,7 @@ async function executeDesignExpertTool(name, input, context, toolCallId, onLog)
|
|
|
3618
3644
|
if (!tool) {
|
|
3619
3645
|
return `Error: unknown tool "${name}"`;
|
|
3620
3646
|
}
|
|
3621
|
-
return tool.execute(input, onLog);
|
|
3647
|
+
return tool.execute(input, onLog, context);
|
|
3622
3648
|
}
|
|
3623
3649
|
|
|
3624
3650
|
// src/subagents/designExpert/prompt.ts
|
|
@@ -4856,6 +4882,9 @@ async function runTurn(params) {
|
|
|
4856
4882
|
let turnCacheCreation = 0;
|
|
4857
4883
|
let turnCacheRead = 0;
|
|
4858
4884
|
let turnLlmCalls = 0;
|
|
4885
|
+
let lastCallInputTokens = 0;
|
|
4886
|
+
let lastCallCacheCreation = 0;
|
|
4887
|
+
let lastCallCacheRead = 0;
|
|
4859
4888
|
while (true) {
|
|
4860
4889
|
let getOrCreateAccumulator2 = function(id, name) {
|
|
4861
4890
|
let acc = toolInputAccumulators.get(id);
|
|
@@ -5051,10 +5080,13 @@ async function runTurn(params) {
|
|
|
5051
5080
|
case "done":
|
|
5052
5081
|
stopReason = event.stopReason;
|
|
5053
5082
|
turnLlmCalls++;
|
|
5054
|
-
|
|
5083
|
+
lastCallInputTokens = event.usage.inputTokens;
|
|
5084
|
+
lastCallCacheCreation = event.usage.cacheCreationTokens ?? 0;
|
|
5085
|
+
lastCallCacheRead = event.usage.cacheReadTokens ?? 0;
|
|
5086
|
+
turnInputTokens += lastCallInputTokens;
|
|
5055
5087
|
turnOutputTokens += event.usage.outputTokens;
|
|
5056
|
-
turnCacheCreation +=
|
|
5057
|
-
turnCacheRead +=
|
|
5088
|
+
turnCacheCreation += lastCallCacheCreation;
|
|
5089
|
+
turnCacheRead += lastCallCacheRead;
|
|
5058
5090
|
break;
|
|
5059
5091
|
case "error":
|
|
5060
5092
|
onEvent({ type: "error", error: friendlyError(event.error) });
|
|
@@ -5099,7 +5131,10 @@ async function runTurn(params) {
|
|
|
5099
5131
|
outputTokens: turnOutputTokens,
|
|
5100
5132
|
cacheCreationTokens: turnCacheCreation || void 0,
|
|
5101
5133
|
cacheReadTokens: turnCacheRead || void 0,
|
|
5102
|
-
llmCalls: turnLlmCalls
|
|
5134
|
+
llmCalls: turnLlmCalls,
|
|
5135
|
+
lastCallInputTokens,
|
|
5136
|
+
lastCallCacheCreation: lastCallCacheCreation || void 0,
|
|
5137
|
+
lastCallCacheRead: lastCallCacheRead || void 0
|
|
5103
5138
|
}
|
|
5104
5139
|
});
|
|
5105
5140
|
return;
|
|
@@ -5550,7 +5585,7 @@ ${xmlParts}
|
|
|
5550
5585
|
sessionStats.totalOutputTokens += e.stats.outputTokens;
|
|
5551
5586
|
sessionStats.totalCacheCreationTokens += e.stats.cacheCreationTokens ?? 0;
|
|
5552
5587
|
sessionStats.totalCacheReadTokens += e.stats.cacheReadTokens ?? 0;
|
|
5553
|
-
sessionStats.lastContextSize = e.stats.inputTokens;
|
|
5588
|
+
sessionStats.lastContextSize = e.stats.lastCallInputTokens ?? e.stats.inputTokens;
|
|
5554
5589
|
}
|
|
5555
5590
|
sessionStats.messageCount = state.messages.length;
|
|
5556
5591
|
sessionStats.updatedAt = Date.now();
|
package/dist/index.js
CHANGED
|
@@ -221,7 +221,7 @@ async function* streamChat(params) {
|
|
|
221
221
|
}
|
|
222
222
|
}
|
|
223
223
|
function isRetryableError(error) {
|
|
224
|
-
return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error);
|
|
224
|
+
return /Network error/i.test(error) || /HTTP 5\d\d/i.test(error) || /Stream stalled/i.test(error) || /overloaded/i.test(error);
|
|
225
225
|
}
|
|
226
226
|
function sleep(ms) {
|
|
227
227
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
@@ -1477,7 +1477,7 @@ var init_editFile = __esm({
|
|
|
1477
1477
|
editFileTool = {
|
|
1478
1478
|
definition: {
|
|
1479
1479
|
name: "editFile",
|
|
1480
|
-
description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match.",
|
|
1480
|
+
description: "Replace a string in a file. old_string must appear exactly once (minor indentation differences are handled automatically). Set replace_all to true to replace every occurrence at once. For bulk mechanical substitutions (renaming a variable, swapping colors), prefer replace_all. Always read the file first so you know the exact text to match. When editing nested structures (objects, function bodies, arrays, template literals), always include the full enclosing structure in old_string rather than just an inner fragment. Replacing a partial slice from the middle of nested code is the most common source of syntax errors.",
|
|
1481
1481
|
inputSchema: {
|
|
1482
1482
|
type: "object",
|
|
1483
1483
|
properties: {
|
|
@@ -3172,7 +3172,29 @@ __export(screenshot_exports, {
|
|
|
3172
3172
|
definition: () => definition5,
|
|
3173
3173
|
execute: () => execute5
|
|
3174
3174
|
});
|
|
3175
|
-
async function execute5(input, onLog) {
|
|
3175
|
+
async function execute5(input, onLog, context) {
|
|
3176
|
+
if (input.instructions && context) {
|
|
3177
|
+
try {
|
|
3178
|
+
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
|
|
3179
|
+
const result = await browserAutomationTool.execute({ task }, context);
|
|
3180
|
+
const urlMatch = result.match(
|
|
3181
|
+
/https:\/\/[^\s"')]+\.(?:png|jpg|jpeg|webp)/i
|
|
3182
|
+
);
|
|
3183
|
+
if (!urlMatch) {
|
|
3184
|
+
return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${result}`;
|
|
3185
|
+
}
|
|
3186
|
+
const url = urlMatch[0];
|
|
3187
|
+
const analysisPrompt = input.prompt || SCREENSHOT_ANALYSIS_PROMPT;
|
|
3188
|
+
const analysis = await analyzeImage({
|
|
3189
|
+
prompt: analysisPrompt,
|
|
3190
|
+
imageUrl: url,
|
|
3191
|
+
onLog
|
|
3192
|
+
});
|
|
3193
|
+
return JSON.stringify({ url, analysis });
|
|
3194
|
+
} catch (err) {
|
|
3195
|
+
return `Error taking interactive screenshot: ${err.message}`;
|
|
3196
|
+
}
|
|
3197
|
+
}
|
|
3176
3198
|
try {
|
|
3177
3199
|
return await captureAndAnalyzeScreenshot({
|
|
3178
3200
|
prompt: input.prompt,
|
|
@@ -3188,6 +3210,8 @@ var init_screenshot3 = __esm({
|
|
|
3188
3210
|
"src/subagents/designExpert/tools/screenshot.ts"() {
|
|
3189
3211
|
"use strict";
|
|
3190
3212
|
init_screenshot();
|
|
3213
|
+
init_analyzeImage();
|
|
3214
|
+
init_browserAutomation();
|
|
3191
3215
|
definition5 = {
|
|
3192
3216
|
name: "screenshot",
|
|
3193
3217
|
description: "Capture a full-height screenshot of the current app preview. Returns a CDN URL along with visual analysis. Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
|
|
@@ -3201,6 +3225,10 @@ var init_screenshot3 = __esm({
|
|
|
3201
3225
|
path: {
|
|
3202
3226
|
type: "string",
|
|
3203
3227
|
description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
|
|
3228
|
+
},
|
|
3229
|
+
instructions: {
|
|
3230
|
+
type: "string",
|
|
3231
|
+
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot."
|
|
3204
3232
|
}
|
|
3205
3233
|
}
|
|
3206
3234
|
}
|
|
@@ -3482,7 +3510,7 @@ async function executeDesignExpertTool(name, input, context, toolCallId, onLog)
|
|
|
3482
3510
|
if (!tool) {
|
|
3483
3511
|
return `Error: unknown tool "${name}"`;
|
|
3484
3512
|
}
|
|
3485
|
-
return tool.execute(input, onLog);
|
|
3513
|
+
return tool.execute(input, onLog, context);
|
|
3486
3514
|
}
|
|
3487
3515
|
var tools, DESIGN_EXPERT_TOOLS;
|
|
3488
3516
|
var init_tools2 = __esm({
|
|
@@ -4896,6 +4924,9 @@ async function runTurn(params) {
|
|
|
4896
4924
|
let turnCacheCreation = 0;
|
|
4897
4925
|
let turnCacheRead = 0;
|
|
4898
4926
|
let turnLlmCalls = 0;
|
|
4927
|
+
let lastCallInputTokens = 0;
|
|
4928
|
+
let lastCallCacheCreation = 0;
|
|
4929
|
+
let lastCallCacheRead = 0;
|
|
4899
4930
|
while (true) {
|
|
4900
4931
|
let getOrCreateAccumulator2 = function(id, name) {
|
|
4901
4932
|
let acc = toolInputAccumulators.get(id);
|
|
@@ -5091,10 +5122,13 @@ async function runTurn(params) {
|
|
|
5091
5122
|
case "done":
|
|
5092
5123
|
stopReason = event.stopReason;
|
|
5093
5124
|
turnLlmCalls++;
|
|
5094
|
-
|
|
5125
|
+
lastCallInputTokens = event.usage.inputTokens;
|
|
5126
|
+
lastCallCacheCreation = event.usage.cacheCreationTokens ?? 0;
|
|
5127
|
+
lastCallCacheRead = event.usage.cacheReadTokens ?? 0;
|
|
5128
|
+
turnInputTokens += lastCallInputTokens;
|
|
5095
5129
|
turnOutputTokens += event.usage.outputTokens;
|
|
5096
|
-
turnCacheCreation +=
|
|
5097
|
-
turnCacheRead +=
|
|
5130
|
+
turnCacheCreation += lastCallCacheCreation;
|
|
5131
|
+
turnCacheRead += lastCallCacheRead;
|
|
5098
5132
|
break;
|
|
5099
5133
|
case "error":
|
|
5100
5134
|
onEvent({ type: "error", error: friendlyError(event.error) });
|
|
@@ -5139,7 +5173,10 @@ async function runTurn(params) {
|
|
|
5139
5173
|
outputTokens: turnOutputTokens,
|
|
5140
5174
|
cacheCreationTokens: turnCacheCreation || void 0,
|
|
5141
5175
|
cacheReadTokens: turnCacheRead || void 0,
|
|
5142
|
-
llmCalls: turnLlmCalls
|
|
5176
|
+
llmCalls: turnLlmCalls,
|
|
5177
|
+
lastCallInputTokens,
|
|
5178
|
+
lastCallCacheCreation: lastCallCacheCreation || void 0,
|
|
5179
|
+
lastCallCacheRead: lastCallCacheRead || void 0
|
|
5143
5180
|
}
|
|
5144
5181
|
});
|
|
5145
5182
|
return;
|
|
@@ -6162,7 +6199,7 @@ ${xmlParts}
|
|
|
6162
6199
|
sessionStats.totalOutputTokens += e.stats.outputTokens;
|
|
6163
6200
|
sessionStats.totalCacheCreationTokens += e.stats.cacheCreationTokens ?? 0;
|
|
6164
6201
|
sessionStats.totalCacheReadTokens += e.stats.cacheReadTokens ?? 0;
|
|
6165
|
-
sessionStats.lastContextSize = e.stats.inputTokens;
|
|
6202
|
+
sessionStats.lastContextSize = e.stats.lastCallInputTokens ?? e.stats.inputTokens;
|
|
6166
6203
|
}
|
|
6167
6204
|
sessionStats.messageCount = state.messages.length;
|
|
6168
6205
|
sessionStats.updatedAt = Date.now();
|
|
@@ -10,7 +10,7 @@ Note: when you talk about the team to the user, refer to them by their name or a
|
|
|
10
10
|
|
|
11
11
|
Your designer. Consult for any visual decision — choosing a color, picking fonts, proposing a layout, sourcing images, reviewing whether something looks good. Not just during intake or big design moments. If you're about to write CSS and you're not sure about a color, ask. If you just built a page and want a gut check, ask the designer to take a quick look. If the user says "I don't like how this looks," ask the design expert what to change rather than guessing yourself, or if they say "I want a different image," that's the designer's problem, not yours. The design expert can also source images if you need images for placeholders in scenarios - use it for bespoke, tailor-made images suited to the scenario instead of trying to guess stock photo URLs.
|
|
12
12
|
|
|
13
|
-
The design expert cannot see your conversation with the user, so include relevant context and requirements in your task. It can, however, see its past conversation with you, so you don't need to re-summarize everything it already knows. Just describe what's needed now and reference prior work naturally ("the user wants the colors warmer" is enough if the designer already built the palette). It can take screenshots of the app preview on its own — just ask it to review what's been built. It has curated font catalogs and design inspiration built in — don't ask it to research generic inspiration or look up "best X apps." Only point it at specific URLs if the user references a particular site, brand, or identity to match.
|
|
13
|
+
The design expert cannot see your conversation with the user, so include relevant context and requirements in your task. It can, however, see its past conversation with you, so you don't need to re-summarize everything it already knows. Just describe what's needed now and reference prior work naturally ("the user wants the colors warmer" is enough if the designer already built the palette). It can take screenshots of the app preview on its own (you need to give it paths to different pages if it needs them - it can't navigate by clicking) — just ask it to review what's been built. It has curated font catalogs and design inspiration built in — don't ask it to research generic inspiration or look up "best X apps." Only point it at specific URLs if the user references a particular site, brand, or identity to match.
|
|
14
14
|
|
|
15
15
|
The designer will return concrete resources: hex values, font names with CSS URLs, image URLs, layout descriptions, as well as specific techniques, CSS properties, and other values. Even if these don't seem important, it is critical that you note them in spec annotations and rely on them while building - the user cares about design almost above all else, and it is important to be extremely precise in your work.
|
|
16
16
|
|
|
@@ -38,7 +38,7 @@ Always consult the code sanity check before writing code in initialCodegen with
|
|
|
38
38
|
|
|
39
39
|
### QA (`runAutomatedBrowserTest`)
|
|
40
40
|
|
|
41
|
-
For verifying complex stateful interactions: multi-step form submissions, auth flows, real-time updates, flows that require specific data/role setup. This spins up a full chrome browser automation — it's heavyweight. Do not use it for basic rendering or navigation checks. If you can verify something with a screenshot or by reading the code, do that instead. Run a scenario first to seed test data and set user roles. The user is able to watch QA work on their screen via a live browser preview - the cursor will move, type, etc - so you can also use this to demo functionality to the user and help them understand how to use their app.
|
|
41
|
+
For verifying complex stateful interactions: multi-step form submissions, auth flows, real-time updates, flows that require specific data/role setup. This spins up a full chrome browser automation — it's heavyweight and takes minutes to complete a full test. Do not use it for basic rendering or navigation checks. If you can verify something with a screenshot or by reading the code, do that instead. Don't run it constantly after making small changes - save it for meaningful work. Run a scenario first to seed test data and set user roles. The user is able to watch QA work on their screen via a live browser preview - the cursor will move, type, etc - so you can also use this to demo functionality to the user and help them understand how to use their app.
|
|
42
42
|
|
|
43
43
|
The QA agent can see the screen. Describe what to test, not how — it will figure out what to click, what to check, and what values to use.
|
|
44
44
|
|
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
You are a browser smoke test agent. You verify that features work end to end by interacting with the live preview. Focus on outcomes: does the feature work? Did the expected content appear? Just do the thing and see if it worked.
|
|
2
2
|
|
|
3
|
+
## Rules to Remember
|
|
4
|
+
- Don't overthink the tests - the goal is to generally make sure things work as expected, not to provide detailed QA. If something seems mostly okay, note it and move on. Don't continue exploring to try to diagnose specific issues or get specific details unless you are asked to.
|
|
5
|
+
|
|
3
6
|
## Tester Persona
|
|
4
7
|
The user is watching the automation happen on their screen in real-time. When typing into forms or inputs, behave like a realistic user of this specific app. Use the app context (if provided) to understand the audience and tone. Type the way that audience would actually type — not formal, not robotic. The app developer's name is Remy, so use that and the email remy@mindstudio.ai as the basis for any testing that requires a persona.
|
|
5
8
|
|
|
@@ -10,6 +10,7 @@ Then, think about the layout and UI patterns - these are the core of the user's
|
|
|
10
10
|
|
|
11
11
|
## Tool Usage
|
|
12
12
|
- When multiple tool calls are independent, make them all in a single turn. Searching for three different products, or fetching two reference sites: batch them instead of doing one per turn.
|
|
13
|
+
- The screenshot tool supports an `instructions` parameter for taking screenshots that require interaction first. If you need to screenshot a state that's behind a modal, a specific tab, or a multi-step flow, pass `instructions` describing how to get there (e.g., "dismiss the welcome modal, then click XYZ"). A browser automation agent will follow your instructions and capture the screenshot for you.
|
|
13
14
|
|
|
14
15
|
## Voice
|
|
15
16
|
- No emoji, no filler.
|