@mindstudio-ai/remy 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/headless.js CHANGED
@@ -2258,6 +2258,14 @@ var BROWSER_TOOLS = [
2258
2258
  type: "object",
2259
2259
  properties: {}
2260
2260
  }
2261
+ },
2262
+ {
2263
+ name: "resetBrowser",
2264
+ description: "Reset the browser to a clean state. Call this once after all tests are complete to restore the preview for the user. Fire and forget \u2014 does not wait for the reload to finish.",
2265
+ inputSchema: {
2266
+ type: "object",
2267
+ properties: {}
2268
+ }
2261
2269
  }
2262
2270
  ];
2263
2271
  var BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand", "screenshot"]);
@@ -2307,7 +2315,17 @@ var browserAutomationTool = {
2307
2315
  task: input.task,
2308
2316
  tools: BROWSER_TOOLS,
2309
2317
  externalTools: BROWSER_EXTERNAL_TOOLS,
2310
- executeTool: async () => "Error: no local tools in browser automation",
2318
+ executeTool: async (name) => {
2319
+ if (name === "resetBrowser") {
2320
+ try {
2321
+ await sidecarRequest("/reset-browser", {}, { timeout: 5e3 });
2322
+ return "Browser reset triggered.";
2323
+ } catch {
2324
+ return "Error: could not reset browser.";
2325
+ }
2326
+ }
2327
+ return `Error: unknown local tool "${name}"`;
2328
+ },
2311
2329
  apiConfig: context.apiConfig,
2312
2330
  model: context.model,
2313
2331
  signal: context.signal,
@@ -2462,6 +2480,32 @@ var DESIGN_RESEARCH_TOOLS = [
2462
2480
  },
2463
2481
  required: ["prompts"]
2464
2482
  }
2483
+ },
2484
+ {
2485
+ name: "editImage",
2486
+ description: "Edit an existing image using a text instruction. Takes a source image URL and a prompt describing the edits (color grading, style transfer, modifications, adding/removing elements). Returns a new CDN URL.",
2487
+ inputSchema: {
2488
+ type: "object",
2489
+ properties: {
2490
+ imageUrl: {
2491
+ type: "string",
2492
+ description: "URL of the source image to edit."
2493
+ },
2494
+ prompt: {
2495
+ type: "string",
2496
+ description: 'What to change. Describe the edit as an instruction: "apply warm golden hour color grading", "make the background darker", "add a subtle film grain texture".'
2497
+ },
2498
+ width: {
2499
+ type: "number",
2500
+ description: "Output width in pixels. Default 2048. Range: 2048-4096."
2501
+ },
2502
+ height: {
2503
+ type: "number",
2504
+ description: "Output height in pixels. Default 2048. Range: 2048-4096."
2505
+ }
2506
+ },
2507
+ required: ["imageUrl", "prompt"]
2508
+ }
2465
2509
  }
2466
2510
  ];
2467
2511
  function runCli(cmd) {
@@ -2507,37 +2551,17 @@ async function executeDesignTool(name, input) {
2507
2551
  `mindstudio analyze-image --prompt ${JSON.stringify(DESIGN_REFERENCE_PROMPT)} --image-url ${JSON.stringify(input.imageUrl)} --no-meta`
2508
2552
  );
2509
2553
  case "screenshotAndAnalyze": {
2510
- const screenshotResult = await runCli(
2511
- `mindstudio scrape-url --url ${JSON.stringify(input.url)} --page-options ${JSON.stringify(JSON.stringify({ onlyMainContent: true, screenshot: true }))} --no-meta`
2554
+ const ssUrl = await runCli(
2555
+ `mindstudio screenshot-url --url ${JSON.stringify(input.url)} --mode viewport --width 1440 --delay 2000 --output-key screenshotUrl --no-meta`
2512
2556
  );
2513
- const screenshotMatch = screenshotResult.match(
2514
- /https:\/\/[^\s"']+(?:\.png|\.jpg|\.jpeg|\.webp|screenshot[^\s"']*)/i
2515
- );
2516
- if (!screenshotMatch) {
2517
- try {
2518
- const parsed = JSON.parse(screenshotResult);
2519
- const ssUrl = parsed.screenshot || parsed.screenshotUrl || parsed.content?.screenshotUrl;
2520
- if (ssUrl) {
2521
- const analysisPrompt2 = input.prompt || DESIGN_REFERENCE_PROMPT;
2522
- const analysis2 = await runCli(
2523
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt2)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2524
- );
2525
- return `Screenshot: ${ssUrl}
2526
-
2527
- ${analysis2}`;
2528
- }
2529
- } catch {
2530
- }
2531
- return `Fetched ${input.url} but could not extract screenshot URL.
2532
-
2533
- Page content:
2534
- ${screenshotResult}`;
2557
+ if (ssUrl.startsWith("Error")) {
2558
+ return `Could not screenshot ${input.url}: ${ssUrl}`;
2535
2559
  }
2536
2560
  const analysisPrompt = input.prompt || DESIGN_REFERENCE_PROMPT;
2537
2561
  const analysis = await runCli(
2538
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(screenshotMatch[0])} --no-meta`
2562
+ `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2539
2563
  );
2540
- return `Screenshot: ${screenshotMatch[0]}
2564
+ return `Screenshot: ${ssUrl}
2541
2565
 
2542
2566
  ${analysis}`;
2543
2567
  }
@@ -2581,6 +2605,24 @@ ${analysis}`;
2581
2605
  }));
2582
2606
  return runCli(`mindstudio batch '${JSON.stringify(steps)}' --no-meta`);
2583
2607
  }
2608
+ case "editImage": {
2609
+ const width = input.width || 2048;
2610
+ const height = input.height || 2048;
2611
+ const step = JSON.stringify({
2612
+ prompt: input.prompt,
2613
+ imageModelOverride: {
2614
+ model: "seedream-4.5",
2615
+ config: {
2616
+ images: [input.imageUrl],
2617
+ width,
2618
+ height
2619
+ }
2620
+ }
2621
+ });
2622
+ return runCli(
2623
+ `mindstudio generate-image '${step}' --output-key imageUrl --no-meta`
2624
+ );
2625
+ }
2584
2626
  default:
2585
2627
  return `Error: unknown tool "${name}"`;
2586
2628
  }
@@ -2666,7 +2708,7 @@ ${pairingList}
2666
2708
  const inspirationSection = images.length ? `<inspiration_images>
2667
2709
  ## Design inspiration
2668
2710
 
2669
- A random sample of pre-analyzed design references. Use these observations to inform your recommendations and build something creative, unique, and compelling.
2711
+ This is what the bar looks like. These are real sites that made it onto curated design galleries because they did something bold, intentional, and memorable. Study the moves they make \u2014 the confident color choices, the unexpected layouts, the typography that carries the whole page. Your recommendations should feel like they belong in this company.
2670
2712
 
2671
2713
  ${imageList}
2672
2714
  </inspiration_images>` : "";
@@ -2687,8 +2729,8 @@ The visual design expert can be used for all things visual design, from quick qu
2687
2729
  - Layout and composition ideas that go beyond generic AI defaults
2688
2730
  - Analyzing a reference site or screenshot for design insights (it can take screenshots and do research on its own)
2689
2731
  - Beautiful layout images or photos
2690
- - Icon recommendations
2691
- - Proposing full visual directions during intake
2732
+ - Icon recommendations or AI image editing
2733
+ - Proposing full visual design and layout directions during intake
2692
2734
 
2693
2735
  **How to write the task:**
2694
2736
  Include context about the app \u2014 what it does, who uses it, what mood or feeling the interface should convey. If the user has any specific requirements, be sure to include them. The agent can not see your conversation with the user, so you need to include all details. More context produces better results. For quick questions ("three font pairings for a <x> app"), brief is fine. You can ask for multiple topics, multiple options, etc.
@@ -3195,6 +3237,8 @@ async function runTurn(params) {
3195
3237
  });
3196
3238
  }
3197
3239
  state.messages.push(userMsg);
3240
+ let lastCompletedTools = "";
3241
+ let lastCompletedResult = "";
3198
3242
  while (true) {
3199
3243
  let getOrCreateAccumulator2 = function(id, name) {
3200
3244
  let acc = toolInputAccumulators.get(id);
@@ -3281,7 +3325,8 @@ async function runTurn(params) {
3281
3325
  apiConfig,
3282
3326
  getContext: () => ({
3283
3327
  assistantText: assistantText.slice(-500),
3284
- lastToolName: toolCalls.at(-1)?.name
3328
+ lastToolName: toolCalls.at(-1)?.name || lastCompletedTools || void 0,
3329
+ lastToolResult: lastCompletedResult || void 0
3285
3330
  }),
3286
3331
  onStatus: (label) => onEvent({ type: "status", message: label }),
3287
3332
  signal
@@ -3410,15 +3455,6 @@ async function runTurn(params) {
3410
3455
  count: toolCalls.length,
3411
3456
  tools: toolCalls.map((tc) => tc.name)
3412
3457
  });
3413
- const toolStatusWatcher = startStatusWatcher({
3414
- apiConfig,
3415
- getContext: () => ({
3416
- assistantText: assistantText.slice(-500),
3417
- lastToolName: toolCalls.map((tc) => tc.name).join(", ")
3418
- }),
3419
- onStatus: (label) => onEvent({ type: "status", message: label }),
3420
- signal
3421
- });
3422
3458
  const results = await Promise.all(
3423
3459
  toolCalls.map(async (tc) => {
3424
3460
  if (signal?.aborted) {
@@ -3476,7 +3512,8 @@ async function runTurn(params) {
3476
3512
  }
3477
3513
  })
3478
3514
  );
3479
- toolStatusWatcher.stop();
3515
+ lastCompletedTools = toolCalls.map((tc) => tc.name).join(", ");
3516
+ lastCompletedResult = results.at(-1)?.result ?? "";
3480
3517
  for (const r of results) {
3481
3518
  state.messages.push({
3482
3519
  role: "user",
package/dist/index.js CHANGED
@@ -2208,6 +2208,14 @@ var init_tools = __esm({
2208
2208
  type: "object",
2209
2209
  properties: {}
2210
2210
  }
2211
+ },
2212
+ {
2213
+ name: "resetBrowser",
2214
+ description: "Reset the browser to a clean state. Call this once after all tests are complete to restore the preview for the user. Fire and forget \u2014 does not wait for the reload to finish.",
2215
+ inputSchema: {
2216
+ type: "object",
2217
+ properties: {}
2218
+ }
2211
2219
  }
2212
2220
  ];
2213
2221
  BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand", "screenshot"]);
@@ -2273,7 +2281,17 @@ var init_browserAutomation = __esm({
2273
2281
  task: input.task,
2274
2282
  tools: BROWSER_TOOLS,
2275
2283
  externalTools: BROWSER_EXTERNAL_TOOLS,
2276
- executeTool: async () => "Error: no local tools in browser automation",
2284
+ executeTool: async (name) => {
2285
+ if (name === "resetBrowser") {
2286
+ try {
2287
+ await sidecarRequest("/reset-browser", {}, { timeout: 5e3 });
2288
+ return "Browser reset triggered.";
2289
+ } catch {
2290
+ return "Error: could not reset browser.";
2291
+ }
2292
+ }
2293
+ return `Error: unknown local tool "${name}"`;
2294
+ },
2277
2295
  apiConfig: context.apiConfig,
2278
2296
  model: context.model,
2279
2297
  signal: context.signal,
@@ -2331,37 +2349,17 @@ async function executeDesignTool(name, input) {
2331
2349
  `mindstudio analyze-image --prompt ${JSON.stringify(DESIGN_REFERENCE_PROMPT)} --image-url ${JSON.stringify(input.imageUrl)} --no-meta`
2332
2350
  );
2333
2351
  case "screenshotAndAnalyze": {
2334
- const screenshotResult = await runCli(
2335
- `mindstudio scrape-url --url ${JSON.stringify(input.url)} --page-options ${JSON.stringify(JSON.stringify({ onlyMainContent: true, screenshot: true }))} --no-meta`
2336
- );
2337
- const screenshotMatch = screenshotResult.match(
2338
- /https:\/\/[^\s"']+(?:\.png|\.jpg|\.jpeg|\.webp|screenshot[^\s"']*)/i
2352
+ const ssUrl = await runCli(
2353
+ `mindstudio screenshot-url --url ${JSON.stringify(input.url)} --mode viewport --width 1440 --delay 2000 --output-key screenshotUrl --no-meta`
2339
2354
  );
2340
- if (!screenshotMatch) {
2341
- try {
2342
- const parsed = JSON.parse(screenshotResult);
2343
- const ssUrl = parsed.screenshot || parsed.screenshotUrl || parsed.content?.screenshotUrl;
2344
- if (ssUrl) {
2345
- const analysisPrompt2 = input.prompt || DESIGN_REFERENCE_PROMPT;
2346
- const analysis2 = await runCli(
2347
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt2)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2348
- );
2349
- return `Screenshot: ${ssUrl}
2350
-
2351
- ${analysis2}`;
2352
- }
2353
- } catch {
2354
- }
2355
- return `Fetched ${input.url} but could not extract screenshot URL.
2356
-
2357
- Page content:
2358
- ${screenshotResult}`;
2355
+ if (ssUrl.startsWith("Error")) {
2356
+ return `Could not screenshot ${input.url}: ${ssUrl}`;
2359
2357
  }
2360
2358
  const analysisPrompt = input.prompt || DESIGN_REFERENCE_PROMPT;
2361
2359
  const analysis = await runCli(
2362
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(screenshotMatch[0])} --no-meta`
2360
+ `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2363
2361
  );
2364
- return `Screenshot: ${screenshotMatch[0]}
2362
+ return `Screenshot: ${ssUrl}
2365
2363
 
2366
2364
  ${analysis}`;
2367
2365
  }
@@ -2405,6 +2403,24 @@ ${analysis}`;
2405
2403
  }));
2406
2404
  return runCli(`mindstudio batch '${JSON.stringify(steps)}' --no-meta`);
2407
2405
  }
2406
+ case "editImage": {
2407
+ const width = input.width || 2048;
2408
+ const height = input.height || 2048;
2409
+ const step = JSON.stringify({
2410
+ prompt: input.prompt,
2411
+ imageModelOverride: {
2412
+ model: "seedream-4.5",
2413
+ config: {
2414
+ images: [input.imageUrl],
2415
+ width,
2416
+ height
2417
+ }
2418
+ }
2419
+ });
2420
+ return runCli(
2421
+ `mindstudio generate-image '${step}' --output-key imageUrl --no-meta`
2422
+ );
2423
+ }
2408
2424
  default:
2409
2425
  return `Error: unknown tool "${name}"`;
2410
2426
  }
@@ -2555,6 +2571,32 @@ Be specific and concise.`;
2555
2571
  },
2556
2572
  required: ["prompts"]
2557
2573
  }
2574
+ },
2575
+ {
2576
+ name: "editImage",
2577
+ description: "Edit an existing image using a text instruction. Takes a source image URL and a prompt describing the edits (color grading, style transfer, modifications, adding/removing elements). Returns a new CDN URL.",
2578
+ inputSchema: {
2579
+ type: "object",
2580
+ properties: {
2581
+ imageUrl: {
2582
+ type: "string",
2583
+ description: "URL of the source image to edit."
2584
+ },
2585
+ prompt: {
2586
+ type: "string",
2587
+ description: 'What to change. Describe the edit as an instruction: "apply warm golden hour color grading", "make the background darker", "add a subtle film grain texture".'
2588
+ },
2589
+ width: {
2590
+ type: "number",
2591
+ description: "Output width in pixels. Default 2048. Range: 2048-4096."
2592
+ },
2593
+ height: {
2594
+ type: "number",
2595
+ description: "Output height in pixels. Default 2048. Range: 2048-4096."
2596
+ }
2597
+ },
2598
+ required: ["imageUrl", "prompt"]
2599
+ }
2558
2600
  }
2559
2601
  ];
2560
2602
  }
@@ -2623,7 +2665,7 @@ ${pairingList}
2623
2665
  const inspirationSection = images.length ? `<inspiration_images>
2624
2666
  ## Design inspiration
2625
2667
 
2626
- A random sample of pre-analyzed design references. Use these observations to inform your recommendations and build something creative, unique, and compelling.
2668
+ This is what the bar looks like. These are real sites that made it onto curated design galleries because they did something bold, intentional, and memorable. Study the moves they make \u2014 the confident color choices, the unexpected layouts, the typography that carries the whole page. Your recommendations should feel like they belong in this company.
2627
2669
 
2628
2670
  ${imageList}
2629
2671
  </inspiration_images>` : "";
@@ -2674,8 +2716,8 @@ The visual design expert can be used for all things visual design, from quick qu
2674
2716
  - Layout and composition ideas that go beyond generic AI defaults
2675
2717
  - Analyzing a reference site or screenshot for design insights (it can take screenshots and do research on its own)
2676
2718
  - Beautiful layout images or photos
2677
- - Icon recommendations
2678
- - Proposing full visual directions during intake
2719
+ - Icon recommendations or AI image editing
2720
+ - Proposing full visual design and layout directions during intake
2679
2721
 
2680
2722
  **How to write the task:**
2681
2723
  Include context about the app \u2014 what it does, who uses it, what mood or feeling the interface should convey. If the user has any specific requirements, be sure to include them. The agent can not see your conversation with the user, so you need to include all details. More context produces better results. For quick questions ("three font pairings for a <x> app"), brief is fine. You can ask for multiple topics, multiple options, etc.
@@ -3230,6 +3272,8 @@ async function runTurn(params) {
3230
3272
  });
3231
3273
  }
3232
3274
  state.messages.push(userMsg);
3275
+ let lastCompletedTools = "";
3276
+ let lastCompletedResult = "";
3233
3277
  while (true) {
3234
3278
  let getOrCreateAccumulator2 = function(id, name) {
3235
3279
  let acc = toolInputAccumulators.get(id);
@@ -3316,7 +3360,8 @@ async function runTurn(params) {
3316
3360
  apiConfig,
3317
3361
  getContext: () => ({
3318
3362
  assistantText: assistantText.slice(-500),
3319
- lastToolName: toolCalls.at(-1)?.name
3363
+ lastToolName: toolCalls.at(-1)?.name || lastCompletedTools || void 0,
3364
+ lastToolResult: lastCompletedResult || void 0
3320
3365
  }),
3321
3366
  onStatus: (label) => onEvent({ type: "status", message: label }),
3322
3367
  signal
@@ -3445,15 +3490,6 @@ async function runTurn(params) {
3445
3490
  count: toolCalls.length,
3446
3491
  tools: toolCalls.map((tc) => tc.name)
3447
3492
  });
3448
- const toolStatusWatcher = startStatusWatcher({
3449
- apiConfig,
3450
- getContext: () => ({
3451
- assistantText: assistantText.slice(-500),
3452
- lastToolName: toolCalls.map((tc) => tc.name).join(", ")
3453
- }),
3454
- onStatus: (label) => onEvent({ type: "status", message: label }),
3455
- signal
3456
- });
3457
3493
  const results = await Promise.all(
3458
3494
  toolCalls.map(async (tc) => {
3459
3495
  if (signal?.aborted) {
@@ -3511,7 +3547,8 @@ async function runTurn(params) {
3511
3547
  }
3512
3548
  })
3513
3549
  );
3514
- toolStatusWatcher.stop();
3550
+ lastCompletedTools = toolCalls.map((tc) => tc.name).join(", ");
3551
+ lastCompletedResult = results.at(-1)?.result ?? "";
3515
3552
  for (const r of results) {
3516
3553
  state.messages.push({
3517
3554
  role: "user",
@@ -22,9 +22,19 @@ Start from these four and extend as needed. Add interface specs for other interf
22
22
 
23
23
  Users often care about look and feel as much as (or more than) underlying data structures. Don't treat the brand and interface specs as an afterthought — for many users, the visual identity and voice are the first things they want to get right.
24
24
 
25
- Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, and implementation hints belong in annotations, not in the prose.
25
+ Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, CSS properties, code snippets, and implementation hints belong in annotations, not in the prose.
26
26
 
27
- When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax (`![description](url)`). The spec should be a visual document — if there's a hero image, a background photo, or a generated graphic, include it inline so the user can see it and the coding agent can reference it during build.
27
+ When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax. Write descriptive alt text that captures what the image actually depicts (this helps accessibility and helps the coding agent understand the image without loading it). Use the surrounding prose to explain the design intent — what the image is for, how it should be used in the layout, and why it was chosen.
28
+
29
+ ```markdown
30
+ ### Hero Section
31
+
32
+ The hero uses a full-bleed editorial photograph. The image should be used as
33
+ a background with the headline overlaid where there's negative space.
34
+
35
+ ![Editorial portrait, warm golden hour lighting, person looking out over a
36
+ city skyline, shallow depth of field, shot on 85mm](https://i.mscdn.ai/...)
37
+ ```
28
38
 
29
39
  **Refining with the user:**
30
40
  After writing the first draft, guide the user through it. Don't just ask "does this look good?" — the user is seeing a multi-section spec for the first time.
@@ -22,9 +22,19 @@ Start from these four and extend as needed. Add interface specs for other interf
22
22
 
23
23
  Users often care about look and feel as much as (or more than) underlying data structures. Don't treat the brand and interface specs as an afterthought — for many users, the visual identity and voice are the first things they want to get right.
24
24
 
25
- Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, and implementation hints belong in annotations, not in the prose.
25
+ Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, CSS properties, code snippets, and implementation hints belong in annotations, not in the prose.
26
26
 
27
- When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax (`![description](url)`). The spec should be a visual document — if there's a hero image, a background photo, or a generated graphic, include it inline so the user can see it and the coding agent can reference it during build.
27
+ When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax. Write descriptive alt text that captures what the image actually depicts (this helps accessibility and helps the coding agent understand the image without loading it). Use the surrounding prose to explain the design intent — what the image is for, how it should be used in the layout, and why it was chosen.
28
+
29
+ ```markdown
30
+ ### Hero Section
31
+
32
+ The hero uses a full-bleed editorial photograph. The image should be used as
33
+ a background with the headline overlaid where there's negative space.
34
+
35
+ ![Editorial portrait, warm golden hour lighting, person looking out over a
36
+ city skyline, shallow depth of field, shot on 85mm](https://i.mscdn.ai/...)
37
+ ```
28
38
 
29
39
  **Refining with the user:**
30
40
  After writing the first draft, guide the user through it. Don't just ask "does this look good?" — the user is seeing a multi-section spec for the first time.
@@ -95,6 +95,7 @@ Check a count with evaluate:
95
95
  - evaluate auto-returns simple expressions. `"script": "document.title"` works directly. For multi-statement scripts, use explicit return.
96
96
  - The snapshot in the response is always the most current page state. Even if a wait times out, check the snapshot field; the content you were waiting for may have appeared by then.
97
97
  - Execution stops on first error. If step 2 of 5 fails, steps 3-5 don't run. The response will contain results for steps 0-2 (with step 2 having an error field) plus the current snapshot. Adjust and retry from the failed step.
98
+ - Always call `resetBrowser` as your final action after all tests are complete. This restores the preview to a clean state for the user.
98
99
  </rules>
99
100
 
100
101
  <voice>
@@ -6,37 +6,66 @@ Not every interface needs images. A productivity dashboard, a finance tool, or a
6
6
 
7
7
  Do not provide images as "references" - images must be ready-to-use assets that can be included directly in the design.
8
8
 
9
- ### Two sources
9
+ ### Three tools
10
10
 
11
11
  **AI-generated photos and images** (`generateImages`) — Seedream produces high-quality results for both photorealistic images and abstract/creative visuals. You have full control over the output: style, composition, colors, mood. When generating multiple images, batch them in a single `generateImages` call — they run in parallel. Generated images are production assets, not mockups or concepts — they are hosted on MindStudio CDN at full resolution and will be used directly in the final interface.
12
12
 
13
- **Stock photography** (`searchStockPhotos`) — Pexels has modern, editorial-style photos. Useful for quick placeholders, mockups, or when you need a specific real-world subject (a specific city, a recognizable object, etc.). Write specific queries: "person writing in notebook at minimalist desk, natural light" not "office."
13
+ **Image editing** (`editImage`) — takes an existing image URL and a text instruction describing what to change. Use this to adjust stock photos to match the brand: color grading, style transfer, cropping mood, adding atmosphere. Find a great stock photo, then edit it to align with the design direction.
14
+
15
+ **Stock photography** (`searchStockPhotos`) — Pexels has modern, editorial-style photos. Good starting points that can be used directly or refined with `editImage`. Write specific queries: "person writing in notebook at minimalist desk, natural light" not "office."
14
16
 
15
17
  ### Writing good generation prompts
16
18
 
17
- Lead with the visual style, then describe the content. This order helps the model establish the look before filling in details.
19
+ Write prompts as natural sentences describing a scene, not as comma-separated keyword lists. Describe what a camera would see, not art direction instructions.
20
+
21
+ **Structure:** Subject and action first, then setting, then style and technical details. Include the intended use when relevant.
22
+
23
+ - "A woman laughing while reading on a sun-drenched balcony overlooking a Mediterranean harbor. Editorial photography, shot on Kodak Portra 400, 85mm lens at f/2, soft golden hour light, shallow depth of field. For a lifestyle app hero section."
24
+ - "An overhead view of a cluttered designer's desk with fabric swatches, sketches, and a coffee cup. Natural window light from the left, slightly desaturated tones, Canon 5D with 35mm lens. For an about page."
25
+ - "Smooth organic shapes in deep navy and warm amber, flowing liquid forms with subtle grain texture. Abstract digital art, high contrast, editorial feel."
26
+
27
+ **Photography vocabulary produces the best results.** The model responds strongly to specific references:
28
+ - Film stocks: Kodak Portra, Fuji Superia, Cinestill 800T, expired film
29
+ - Lenses: 85mm f/1.4, 35mm wide angle, 50mm Summilux, macro
30
+ - Lighting: golden hour, chiaroscuro, tungsten warmth, soft diffused studio light, direct flash
31
+ - Shot types: close-up, overhead flat lay, low angle, eye-level candid, aerial
32
+ - Techniques: shallow depth of field, halation around highlights, film grain, motion blur
18
33
 
19
- **Structure:** Style/medium first, then subject, then details.
20
- - "Digital photography, soft natural window light, shallow depth of field. A ceramic coffee cup on a marble countertop, morning light casting long shadows, warm tones."
21
- - "Flat vector illustration, clean lines, limited color palette. An isometric view of a workspace with a laptop, plant, and notebook."
22
- - "Abstract digital art, fluid gradients, high contrast. Deep navy flowing into warm amber, organic liquid shapes, editorial feel."
34
+ **Declare the medium early.** Saying "editorial photograph" vs "watercolor painting" vs "3D render" doesn't just change style — it changes the model's entire approach to composition, color, and detail. Set this expectation in the first sentence.
23
35
 
24
- **For photorealistic images:** Specify the photography style (editorial, portrait, product, aerial), lighting (natural, studio, golden hour, direct flash), and camera characteristics (close-up, wide angle, shallow depth of field, slightly grainy texture).
36
+ **For text in images**, wrap the exact text in double quotes and specify the style: `A neon sign reading "OPEN" in cursive pink lettering against a dark brick wall.`
37
+
38
+ **Compose for the layout.** If you know the image will have text overlaid, request space for it: "negative space in the upper left for headline text" or "clean sky area above the subject." If it's a background, consider "centered subject with clean margins." The first few words of the prompt carry the most weight — lead with the medium and subject.
25
39
 
26
40
  **Avoid:**
27
41
  - Hex codes in prompts — the model renders them as visible text. Describe colors by name instead.
28
- Describing positions of arms, legs, or specific limb arrangements — this confuses image models.
42
+ - Keyword lists separated by commas — write sentences.
43
+ - Describing positions of arms, legs, or specific limb arrangements.
44
+ - Conflicting style instructions ("photorealistic cartoon").
45
+ - Describing what you don't want — say "empty street" not "street with no cars."
46
+ - Mentioning "text" or "text placement" in prompts — the model will try to render text. Request the composition you want ("negative space in the left third") without saying why.
47
+ - Brand names (camera brands, font names, company names) can get rendered as visible text. Use technical specs ("medium format, 120mm lens") instead of brand names ("Hasselblad") when possible.
48
+ - UI component language — "glass morphism effect", "card design", "button with hover state". Write prompts as if briefing a photographer or artist, not describing CSS.
49
+ - Generating text that should be HTML. Headlines, body copy, CTAs, and any text the user needs to read or interact with belongs in the markup, not baked into an image. Text *within a scene* is fine — a neon sign, a logo on a t-shirt, text on a billboard in a cityscape, an app screen in a device mockup. That's part of the visual content.
50
+
51
+ ### How generated images work in the UI
52
+
53
+ Every generated image is a full rectangular frame — a photograph, a poster, a painting, a texture. The image generator does not produce isolated elements, transparent PNGs, or UI components. The coding agent controls how images are used: cropping, blending, overlaying, masking with CSS.
54
+
55
+ This means you can generate a dramatic texture and the coding agent uses it as a card background with a blend mode. You can generate an editorial photo and the coding agent overlays text on it for a hero section. Think of yourself as providing visual ingredients, not finished UI.
29
56
 
30
57
  ### What makes good photos and images
31
58
 
32
- Think about what would actually appear on this page if a real design team made it. Photos and images should have real subjects that connect to the product's story — people, places, objects, scenes. You can make things that are truly beautiful. Generic abstract visuals are the AI image equivalent of purple gradients: safe, meaningless, forgettable. Push for images with specificity, strong subjects, and emotional resonance.
59
+ It's 2026. Everything is lifestyle and editorial. Even a landing page for a productivity tool or a SaaS product should feel like a magazine spread, not a tech blog. The era of sterile stock-photo-of-a-laptop-on-a-desk is over. People respond to beautiful, dramatic, emotionally resonant imagery.
60
+
61
+ Default to photography with real subjects — people, scenes, moments, environments. Use editorial and fashion photography vocabulary in your prompts. When abstract art is the right call (textures, editorial collages, gradient art), make it bold and intentional, not generic gradient blobs.
62
+
63
+ The coding agent should never need to source its own imagery. Always provide URLs.
33
64
 
34
65
  ### When to use images
35
66
 
36
67
  Include image recommendations in your designs when the product calls for it. A landing page without photography feels like a wireframe. A feature section with a real image feels finished. When proposing layouts, specify where images go and what they should depict — don't leave it to the coding agent to figure out.
37
68
 
38
- The coding agent should never need to source its own imagery. Always provide URLs.
39
-
40
69
  ### CDN image transforms
41
70
 
42
71
  Generated images and uploaded images are hosted on `i.mscdn.ai`. Use query string parameters to request appropriately sized images rather than CSS-scaling full-resolution originals:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mindstudio-ai/remy",
3
- "version": "0.1.13",
3
+ "version": "0.1.15",
4
4
  "description": "MindStudio coding agent",
5
5
  "repository": {
6
6
  "type": "git",