@mindstudio-ai/remy 0.1.12 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/headless.js CHANGED
@@ -86,7 +86,7 @@ function resolveConfig(flags) {
86
86
  const activeEnv = file.environment || "prod";
87
87
  const env = file.environments?.[activeEnv];
88
88
  const apiKey = flags?.apiKey || process.env.MINDSTUDIO_API_KEY || env?.apiKey || "";
89
- const baseUrl = flags?.baseUrl || process.env.MINDSTUDIO_BASE_URL || env?.apiBaseUrl || DEFAULT_BASE_URL;
89
+ const baseUrl2 = flags?.baseUrl || process.env.MINDSTUDIO_BASE_URL || env?.apiBaseUrl || DEFAULT_BASE_URL;
90
90
  if (!apiKey) {
91
91
  log.error("No API key found");
92
92
  throw new Error(
@@ -95,52 +95,60 @@ function resolveConfig(flags) {
95
95
  }
96
96
  const keySource = flags?.apiKey ? "cli flag" : process.env.MINDSTUDIO_API_KEY ? "env var" : "config file";
97
97
  log.info("Config resolved", {
98
- baseUrl,
98
+ baseUrl: baseUrl2,
99
99
  keySource,
100
100
  environment: activeEnv
101
101
  });
102
- return { apiKey, baseUrl };
102
+ return { apiKey, baseUrl: baseUrl2 };
103
103
  }
104
104
 
105
105
  // src/prompt/index.ts
106
106
  import fs4 from "fs";
107
107
  import path3 from "path";
108
108
 
109
- // src/tools/_helpers/lsp.ts
110
- var lspBaseUrl = null;
111
- function setLspBaseUrl(url) {
112
- lspBaseUrl = url;
113
- log.info("LSP configured", { url });
109
+ // src/tools/_helpers/sidecar.ts
110
+ var baseUrl = null;
111
+ function setSidecarBaseUrl(url) {
112
+ baseUrl = url;
113
+ log.info("Sidecar configured", { url });
114
114
  }
115
- function isLspConfigured() {
116
- return lspBaseUrl !== null;
115
+ function isSidecarConfigured() {
116
+ return baseUrl !== null;
117
117
  }
118
- async function lspRequest(endpoint, body) {
119
- if (!lspBaseUrl) {
120
- throw new Error("LSP not available");
118
+ async function sidecarRequest(endpoint, body = {}, options) {
119
+ if (!baseUrl) {
120
+ throw new Error("Sidecar not available");
121
121
  }
122
- const url = `${lspBaseUrl}${endpoint}`;
123
- log.debug("LSP request", { endpoint, body });
122
+ const url = `${baseUrl}${endpoint}`;
123
+ log.debug("Sidecar request", { endpoint, body });
124
124
  try {
125
125
  const res = await fetch(url, {
126
126
  method: "POST",
127
127
  headers: { "Content-Type": "application/json" },
128
- body: JSON.stringify(body)
128
+ body: JSON.stringify(body),
129
+ signal: options?.timeout ? AbortSignal.timeout(options.timeout) : void 0
129
130
  });
130
131
  if (!res.ok) {
131
- log.error("LSP sidecar error", { endpoint, status: res.status });
132
- throw new Error(`LSP sidecar error: ${res.status}`);
132
+ log.error("Sidecar error", { endpoint, status: res.status });
133
+ throw new Error(`Sidecar error: ${res.status}`);
133
134
  }
134
135
  return res.json();
135
136
  } catch (err) {
136
- if (err.message.startsWith("LSP sidecar")) {
137
+ if (err.message.startsWith("Sidecar error")) {
137
138
  throw err;
138
139
  }
139
- log.error("LSP connection error", { endpoint, error: err.message });
140
- throw new Error(`LSP connection error: ${err.message}`);
140
+ log.error("Sidecar connection error", { endpoint, error: err.message });
141
+ throw new Error(`Sidecar connection error: ${err.message}`);
141
142
  }
142
143
  }
143
144
 
145
+ // src/tools/_helpers/lsp.ts
146
+ var setLspBaseUrl = setSidecarBaseUrl;
147
+ var isLspConfigured = isSidecarConfigured;
148
+ async function lspRequest(endpoint, body) {
149
+ return sidecarRequest(endpoint, body);
150
+ }
151
+
144
152
  // src/prompt/static/projectContext.ts
145
153
  import fs3 from "fs";
146
154
  import path2 from "path";
@@ -395,8 +403,8 @@ ${viewContext?.activeFile ? `Active file: ${viewContext.activeFile}` : ""}
395
403
 
396
404
  // src/api.ts
397
405
  async function* streamChat(params) {
398
- const { baseUrl, apiKey, signal, ...body } = params;
399
- const url = `${baseUrl}/_internal/v2/agent/remy/chat`;
406
+ const { baseUrl: baseUrl2, apiKey, signal, ...body } = params;
407
+ const url = `${baseUrl2}/_internal/v2/agent/remy/chat`;
400
408
  const startTime = Date.now();
401
409
  const messagesWithAttachments = body.messages.filter(
402
410
  (m) => m.attachments && m.attachments.length > 0
@@ -2250,6 +2258,14 @@ var BROWSER_TOOLS = [
2250
2258
  type: "object",
2251
2259
  properties: {}
2252
2260
  }
2261
+ },
2262
+ {
2263
+ name: "resetBrowser",
2264
+ description: "Reset the browser to a clean state. Call this once after all tests are complete to restore the preview for the user. Fire and forget \u2014 does not wait for the reload to finish.",
2265
+ inputSchema: {
2266
+ type: "object",
2267
+ properties: {}
2268
+ }
2253
2269
  }
2254
2270
  ];
2255
2271
  var BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand", "screenshot"]);
@@ -2282,12 +2298,34 @@ var browserAutomationTool = {
2282
2298
  if (!context) {
2283
2299
  return "Error: browser automation requires execution context (only available in headless mode)";
2284
2300
  }
2301
+ try {
2302
+ const status = await sidecarRequest(
2303
+ "/browser-status",
2304
+ {},
2305
+ { timeout: 5e3 }
2306
+ );
2307
+ if (!status.connected) {
2308
+ return "Error: the browser preview is not connected. The user needs to open the preview before browser tests can run.";
2309
+ }
2310
+ } catch {
2311
+ return "Error: could not check browser status. The dev environment may not be running.";
2312
+ }
2285
2313
  return runSubAgent({
2286
2314
  system: BROWSER_AUTOMATION_PROMPT,
2287
2315
  task: input.task,
2288
2316
  tools: BROWSER_TOOLS,
2289
2317
  externalTools: BROWSER_EXTERNAL_TOOLS,
2290
- executeTool: async () => "Error: no local tools in browser automation",
2318
+ executeTool: async (name) => {
2319
+ if (name === "resetBrowser") {
2320
+ try {
2321
+ await sidecarRequest("/reset-browser", {}, { timeout: 5e3 });
2322
+ return "Browser reset triggered.";
2323
+ } catch {
2324
+ return "Error: could not reset browser.";
2325
+ }
2326
+ }
2327
+ return `Error: unknown local tool "${name}"`;
2328
+ },
2291
2329
  apiConfig: context.apiConfig,
2292
2330
  model: context.model,
2293
2331
  signal: context.signal,
@@ -2442,6 +2480,32 @@ var DESIGN_RESEARCH_TOOLS = [
2442
2480
  },
2443
2481
  required: ["prompts"]
2444
2482
  }
2483
+ },
2484
+ {
2485
+ name: "editImage",
2486
+ description: "Edit an existing image using a text instruction. Takes a source image URL and a prompt describing the edits (color grading, style transfer, modifications, adding/removing elements). Returns a new CDN URL.",
2487
+ inputSchema: {
2488
+ type: "object",
2489
+ properties: {
2490
+ imageUrl: {
2491
+ type: "string",
2492
+ description: "URL of the source image to edit."
2493
+ },
2494
+ prompt: {
2495
+ type: "string",
2496
+ description: 'What to change. Describe the edit as an instruction: "apply warm golden hour color grading", "make the background darker", "add a subtle film grain texture".'
2497
+ },
2498
+ width: {
2499
+ type: "number",
2500
+ description: "Output width in pixels. Default 2048. Range: 2048-4096."
2501
+ },
2502
+ height: {
2503
+ type: "number",
2504
+ description: "Output height in pixels. Default 2048. Range: 2048-4096."
2505
+ }
2506
+ },
2507
+ required: ["imageUrl", "prompt"]
2508
+ }
2445
2509
  }
2446
2510
  ];
2447
2511
  function runCli(cmd) {
@@ -2487,37 +2551,17 @@ async function executeDesignTool(name, input) {
2487
2551
  `mindstudio analyze-image --prompt ${JSON.stringify(DESIGN_REFERENCE_PROMPT)} --image-url ${JSON.stringify(input.imageUrl)} --no-meta`
2488
2552
  );
2489
2553
  case "screenshotAndAnalyze": {
2490
- const screenshotResult = await runCli(
2491
- `mindstudio scrape-url --url ${JSON.stringify(input.url)} --page-options ${JSON.stringify(JSON.stringify({ onlyMainContent: true, screenshot: true }))} --no-meta`
2554
+ const ssUrl = await runCli(
2555
+ `mindstudio screenshot-url --url ${JSON.stringify(input.url)} --mode viewport --width 1440 --delay 2000 --output-key screenshotUrl --no-meta`
2492
2556
  );
2493
- const screenshotMatch = screenshotResult.match(
2494
- /https:\/\/[^\s"']+(?:\.png|\.jpg|\.jpeg|\.webp|screenshot[^\s"']*)/i
2495
- );
2496
- if (!screenshotMatch) {
2497
- try {
2498
- const parsed = JSON.parse(screenshotResult);
2499
- const ssUrl = parsed.screenshot || parsed.screenshotUrl || parsed.content?.screenshotUrl;
2500
- if (ssUrl) {
2501
- const analysisPrompt2 = input.prompt || DESIGN_REFERENCE_PROMPT;
2502
- const analysis2 = await runCli(
2503
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt2)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2504
- );
2505
- return `Screenshot: ${ssUrl}
2506
-
2507
- ${analysis2}`;
2508
- }
2509
- } catch {
2510
- }
2511
- return `Fetched ${input.url} but could not extract screenshot URL.
2512
-
2513
- Page content:
2514
- ${screenshotResult}`;
2557
+ if (ssUrl.startsWith("Error")) {
2558
+ return `Could not screenshot ${input.url}: ${ssUrl}`;
2515
2559
  }
2516
2560
  const analysisPrompt = input.prompt || DESIGN_REFERENCE_PROMPT;
2517
2561
  const analysis = await runCli(
2518
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(screenshotMatch[0])} --no-meta`
2562
+ `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2519
2563
  );
2520
- return `Screenshot: ${screenshotMatch[0]}
2564
+ return `Screenshot: ${ssUrl}
2521
2565
 
2522
2566
  ${analysis}`;
2523
2567
  }
@@ -2561,6 +2605,24 @@ ${analysis}`;
2561
2605
  }));
2562
2606
  return runCli(`mindstudio batch '${JSON.stringify(steps)}' --no-meta`);
2563
2607
  }
2608
+ case "editImage": {
2609
+ const width = input.width || 2048;
2610
+ const height = input.height || 2048;
2611
+ const step = JSON.stringify({
2612
+ prompt: input.prompt,
2613
+ imageModelOverride: {
2614
+ model: "seedream-4.5",
2615
+ config: {
2616
+ images: [input.imageUrl],
2617
+ width,
2618
+ height
2619
+ }
2620
+ }
2621
+ });
2622
+ return runCli(
2623
+ `mindstudio generate-image '${step}' --output-key imageUrl --no-meta`
2624
+ );
2625
+ }
2564
2626
  default:
2565
2627
  return `Error: unknown tool "${name}"`;
2566
2628
  }
@@ -2646,7 +2708,7 @@ ${pairingList}
2646
2708
  const inspirationSection = images.length ? `<inspiration_images>
2647
2709
  ## Design inspiration
2648
2710
 
2649
- A random sample of pre-analyzed design references. Use these observations to inform your recommendations and build something creative, unique, and compelling.
2711
+ This is what the bar looks like. These are real sites that made it onto curated design galleries because they did something bold, intentional, and memorable. Study the moves they make \u2014 the confident color choices, the unexpected layouts, the typography that carries the whole page. Your recommendations should feel like they belong in this company.
2650
2712
 
2651
2713
  ${imageList}
2652
2714
  </inspiration_images>` : "";
@@ -2667,8 +2729,8 @@ The visual design expert can be used for all things visual design, from quick qu
2667
2729
  - Layout and composition ideas that go beyond generic AI defaults
2668
2730
  - Analyzing a reference site or screenshot for design insights (it can take screenshots and do research on its own)
2669
2731
  - Beautiful layout images or photos
2670
- - Icon recommendations
2671
- - Proposing full visual directions during intake
2732
+ - Icon recommendations or AI image editing
2733
+ - Proposing full visual design and layout directions during intake
2672
2734
 
2673
2735
  **How to write the task:**
2674
2736
  Include context about the app \u2014 what it does, who uses it, what mood or feeling the interface should convey. If the user has any specific requirements, be sure to include them. The agent can not see your conversation with the user, so you need to include all details. More context produces better results. For quick questions ("three font pairings for a <x> app"), brief is fine. You can ask for multiple topics, multiple options, etc.
@@ -3175,6 +3237,8 @@ async function runTurn(params) {
3175
3237
  });
3176
3238
  }
3177
3239
  state.messages.push(userMsg);
3240
+ let lastCompletedTools = "";
3241
+ let lastCompletedResult = "";
3178
3242
  while (true) {
3179
3243
  let getOrCreateAccumulator2 = function(id, name) {
3180
3244
  let acc = toolInputAccumulators.get(id);
@@ -3261,7 +3325,8 @@ async function runTurn(params) {
3261
3325
  apiConfig,
3262
3326
  getContext: () => ({
3263
3327
  assistantText: assistantText.slice(-500),
3264
- lastToolName: toolCalls.at(-1)?.name
3328
+ lastToolName: toolCalls.at(-1)?.name || lastCompletedTools || void 0,
3329
+ lastToolResult: lastCompletedResult || void 0
3265
3330
  }),
3266
3331
  onStatus: (label) => onEvent({ type: "status", message: label }),
3267
3332
  signal
@@ -3390,15 +3455,6 @@ async function runTurn(params) {
3390
3455
  count: toolCalls.length,
3391
3456
  tools: toolCalls.map((tc) => tc.name)
3392
3457
  });
3393
- const toolStatusWatcher = startStatusWatcher({
3394
- apiConfig,
3395
- getContext: () => ({
3396
- assistantText: assistantText.slice(-500),
3397
- lastToolName: toolCalls.map((tc) => tc.name).join(", ")
3398
- }),
3399
- onStatus: (label) => onEvent({ type: "status", message: label }),
3400
- signal
3401
- });
3402
3458
  const results = await Promise.all(
3403
3459
  toolCalls.map(async (tc) => {
3404
3460
  if (signal?.aborted) {
@@ -3456,7 +3512,8 @@ async function runTurn(params) {
3456
3512
  }
3457
3513
  })
3458
3514
  );
3459
- toolStatusWatcher.stop();
3515
+ lastCompletedTools = toolCalls.map((tc) => tc.name).join(", ");
3516
+ lastCompletedResult = results.at(-1)?.result ?? "";
3460
3517
  for (const r of results) {
3461
3518
  state.messages.push({
3462
3519
  role: "user",
package/dist/index.js CHANGED
@@ -89,8 +89,8 @@ var init_logger = __esm({
89
89
 
90
90
  // src/api.ts
91
91
  async function* streamChat(params) {
92
- const { baseUrl, apiKey, signal, ...body } = params;
93
- const url = `${baseUrl}/_internal/v2/agent/remy/chat`;
92
+ const { baseUrl: baseUrl2, apiKey, signal, ...body } = params;
93
+ const url = `${baseUrl2}/_internal/v2/agent/remy/chat`;
94
94
  const startTime = Date.now();
95
95
  const messagesWithAttachments = body.messages.filter(
96
96
  (m) => m.attachments && m.attachments.length > 0
@@ -1776,45 +1776,60 @@ var init_editsFinished = __esm({
1776
1776
  }
1777
1777
  });
1778
1778
 
1779
- // src/tools/_helpers/lsp.ts
1780
- function setLspBaseUrl(url) {
1781
- lspBaseUrl = url;
1782
- log.info("LSP configured", { url });
1779
+ // src/tools/_helpers/sidecar.ts
1780
+ function setSidecarBaseUrl(url) {
1781
+ baseUrl = url;
1782
+ log.info("Sidecar configured", { url });
1783
1783
  }
1784
- function isLspConfigured() {
1785
- return lspBaseUrl !== null;
1784
+ function isSidecarConfigured() {
1785
+ return baseUrl !== null;
1786
1786
  }
1787
- async function lspRequest(endpoint, body) {
1788
- if (!lspBaseUrl) {
1789
- throw new Error("LSP not available");
1787
+ async function sidecarRequest(endpoint, body = {}, options) {
1788
+ if (!baseUrl) {
1789
+ throw new Error("Sidecar not available");
1790
1790
  }
1791
- const url = `${lspBaseUrl}${endpoint}`;
1792
- log.debug("LSP request", { endpoint, body });
1791
+ const url = `${baseUrl}${endpoint}`;
1792
+ log.debug("Sidecar request", { endpoint, body });
1793
1793
  try {
1794
1794
  const res = await fetch(url, {
1795
1795
  method: "POST",
1796
1796
  headers: { "Content-Type": "application/json" },
1797
- body: JSON.stringify(body)
1797
+ body: JSON.stringify(body),
1798
+ signal: options?.timeout ? AbortSignal.timeout(options.timeout) : void 0
1798
1799
  });
1799
1800
  if (!res.ok) {
1800
- log.error("LSP sidecar error", { endpoint, status: res.status });
1801
- throw new Error(`LSP sidecar error: ${res.status}`);
1801
+ log.error("Sidecar error", { endpoint, status: res.status });
1802
+ throw new Error(`Sidecar error: ${res.status}`);
1802
1803
  }
1803
1804
  return res.json();
1804
1805
  } catch (err) {
1805
- if (err.message.startsWith("LSP sidecar")) {
1806
+ if (err.message.startsWith("Sidecar error")) {
1806
1807
  throw err;
1807
1808
  }
1808
- log.error("LSP connection error", { endpoint, error: err.message });
1809
- throw new Error(`LSP connection error: ${err.message}`);
1809
+ log.error("Sidecar connection error", { endpoint, error: err.message });
1810
+ throw new Error(`Sidecar connection error: ${err.message}`);
1810
1811
  }
1811
1812
  }
1812
- var lspBaseUrl;
1813
+ var baseUrl;
1814
+ var init_sidecar = __esm({
1815
+ "src/tools/_helpers/sidecar.ts"() {
1816
+ "use strict";
1817
+ init_logger();
1818
+ baseUrl = null;
1819
+ }
1820
+ });
1821
+
1822
+ // src/tools/_helpers/lsp.ts
1823
+ async function lspRequest(endpoint, body) {
1824
+ return sidecarRequest(endpoint, body);
1825
+ }
1826
+ var setLspBaseUrl, isLspConfigured;
1813
1827
  var init_lsp = __esm({
1814
1828
  "src/tools/_helpers/lsp.ts"() {
1815
1829
  "use strict";
1816
- init_logger();
1817
- lspBaseUrl = null;
1830
+ init_sidecar();
1831
+ setLspBaseUrl = setSidecarBaseUrl;
1832
+ isLspConfigured = isSidecarConfigured;
1818
1833
  }
1819
1834
  });
1820
1835
 
@@ -2193,6 +2208,14 @@ var init_tools = __esm({
2193
2208
  type: "object",
2194
2209
  properties: {}
2195
2210
  }
2211
+ },
2212
+ {
2213
+ name: "resetBrowser",
2214
+ description: "Reset the browser to a clean state. Call this once after all tests are complete to restore the preview for the user. Fire and forget \u2014 does not wait for the reload to finish.",
2215
+ inputSchema: {
2216
+ type: "object",
2217
+ properties: {}
2218
+ }
2196
2219
  }
2197
2220
  ];
2198
2221
  BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand", "screenshot"]);
@@ -2221,6 +2244,7 @@ var init_browserAutomation = __esm({
2221
2244
  init_runner();
2222
2245
  init_tools();
2223
2246
  init_prompt();
2247
+ init_sidecar();
2224
2248
  browserAutomationTool = {
2225
2249
  definition: {
2226
2250
  name: "runAutomatedBrowserTest",
@@ -2240,12 +2264,34 @@ var init_browserAutomation = __esm({
2240
2264
  if (!context) {
2241
2265
  return "Error: browser automation requires execution context (only available in headless mode)";
2242
2266
  }
2267
+ try {
2268
+ const status = await sidecarRequest(
2269
+ "/browser-status",
2270
+ {},
2271
+ { timeout: 5e3 }
2272
+ );
2273
+ if (!status.connected) {
2274
+ return "Error: the browser preview is not connected. The user needs to open the preview before browser tests can run.";
2275
+ }
2276
+ } catch {
2277
+ return "Error: could not check browser status. The dev environment may not be running.";
2278
+ }
2243
2279
  return runSubAgent({
2244
2280
  system: BROWSER_AUTOMATION_PROMPT,
2245
2281
  task: input.task,
2246
2282
  tools: BROWSER_TOOLS,
2247
2283
  externalTools: BROWSER_EXTERNAL_TOOLS,
2248
- executeTool: async () => "Error: no local tools in browser automation",
2284
+ executeTool: async (name) => {
2285
+ if (name === "resetBrowser") {
2286
+ try {
2287
+ await sidecarRequest("/reset-browser", {}, { timeout: 5e3 });
2288
+ return "Browser reset triggered.";
2289
+ } catch {
2290
+ return "Error: could not reset browser.";
2291
+ }
2292
+ }
2293
+ return `Error: unknown local tool "${name}"`;
2294
+ },
2249
2295
  apiConfig: context.apiConfig,
2250
2296
  model: context.model,
2251
2297
  signal: context.signal,
@@ -2303,37 +2349,17 @@ async function executeDesignTool(name, input) {
2303
2349
  `mindstudio analyze-image --prompt ${JSON.stringify(DESIGN_REFERENCE_PROMPT)} --image-url ${JSON.stringify(input.imageUrl)} --no-meta`
2304
2350
  );
2305
2351
  case "screenshotAndAnalyze": {
2306
- const screenshotResult = await runCli(
2307
- `mindstudio scrape-url --url ${JSON.stringify(input.url)} --page-options ${JSON.stringify(JSON.stringify({ onlyMainContent: true, screenshot: true }))} --no-meta`
2352
+ const ssUrl = await runCli(
2353
+ `mindstudio screenshot-url --url ${JSON.stringify(input.url)} --mode viewport --width 1440 --delay 2000 --output-key screenshotUrl --no-meta`
2308
2354
  );
2309
- const screenshotMatch = screenshotResult.match(
2310
- /https:\/\/[^\s"']+(?:\.png|\.jpg|\.jpeg|\.webp|screenshot[^\s"']*)/i
2311
- );
2312
- if (!screenshotMatch) {
2313
- try {
2314
- const parsed = JSON.parse(screenshotResult);
2315
- const ssUrl = parsed.screenshot || parsed.screenshotUrl || parsed.content?.screenshotUrl;
2316
- if (ssUrl) {
2317
- const analysisPrompt2 = input.prompt || DESIGN_REFERENCE_PROMPT;
2318
- const analysis2 = await runCli(
2319
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt2)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2320
- );
2321
- return `Screenshot: ${ssUrl}
2322
-
2323
- ${analysis2}`;
2324
- }
2325
- } catch {
2326
- }
2327
- return `Fetched ${input.url} but could not extract screenshot URL.
2328
-
2329
- Page content:
2330
- ${screenshotResult}`;
2355
+ if (ssUrl.startsWith("Error")) {
2356
+ return `Could not screenshot ${input.url}: ${ssUrl}`;
2331
2357
  }
2332
2358
  const analysisPrompt = input.prompt || DESIGN_REFERENCE_PROMPT;
2333
2359
  const analysis = await runCli(
2334
- `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(screenshotMatch[0])} --no-meta`
2360
+ `mindstudio analyze-image --prompt ${JSON.stringify(analysisPrompt)} --image-url ${JSON.stringify(ssUrl)} --no-meta`
2335
2361
  );
2336
- return `Screenshot: ${screenshotMatch[0]}
2362
+ return `Screenshot: ${ssUrl}
2337
2363
 
2338
2364
  ${analysis}`;
2339
2365
  }
@@ -2377,6 +2403,24 @@ ${analysis}`;
2377
2403
  }));
2378
2404
  return runCli(`mindstudio batch '${JSON.stringify(steps)}' --no-meta`);
2379
2405
  }
2406
+ case "editImage": {
2407
+ const width = input.width || 2048;
2408
+ const height = input.height || 2048;
2409
+ const step = JSON.stringify({
2410
+ prompt: input.prompt,
2411
+ imageModelOverride: {
2412
+ model: "seedream-4.5",
2413
+ config: {
2414
+ images: [input.imageUrl],
2415
+ width,
2416
+ height
2417
+ }
2418
+ }
2419
+ });
2420
+ return runCli(
2421
+ `mindstudio generate-image '${step}' --output-key imageUrl --no-meta`
2422
+ );
2423
+ }
2380
2424
  default:
2381
2425
  return `Error: unknown tool "${name}"`;
2382
2426
  }
@@ -2527,6 +2571,32 @@ Be specific and concise.`;
2527
2571
  },
2528
2572
  required: ["prompts"]
2529
2573
  }
2574
+ },
2575
+ {
2576
+ name: "editImage",
2577
+ description: "Edit an existing image using a text instruction. Takes a source image URL and a prompt describing the edits (color grading, style transfer, modifications, adding/removing elements). Returns a new CDN URL.",
2578
+ inputSchema: {
2579
+ type: "object",
2580
+ properties: {
2581
+ imageUrl: {
2582
+ type: "string",
2583
+ description: "URL of the source image to edit."
2584
+ },
2585
+ prompt: {
2586
+ type: "string",
2587
+ description: 'What to change. Describe the edit as an instruction: "apply warm golden hour color grading", "make the background darker", "add a subtle film grain texture".'
2588
+ },
2589
+ width: {
2590
+ type: "number",
2591
+ description: "Output width in pixels. Default 2048. Range: 2048-4096."
2592
+ },
2593
+ height: {
2594
+ type: "number",
2595
+ description: "Output height in pixels. Default 2048. Range: 2048-4096."
2596
+ }
2597
+ },
2598
+ required: ["imageUrl", "prompt"]
2599
+ }
2530
2600
  }
2531
2601
  ];
2532
2602
  }
@@ -2595,7 +2665,7 @@ ${pairingList}
2595
2665
  const inspirationSection = images.length ? `<inspiration_images>
2596
2666
  ## Design inspiration
2597
2667
 
2598
- A random sample of pre-analyzed design references. Use these observations to inform your recommendations and build something creative, unique, and compelling.
2668
+ This is what the bar looks like. These are real sites that made it onto curated design galleries because they did something bold, intentional, and memorable. Study the moves they make \u2014 the confident color choices, the unexpected layouts, the typography that carries the whole page. Your recommendations should feel like they belong in this company.
2599
2669
 
2600
2670
  ${imageList}
2601
2671
  </inspiration_images>` : "";
@@ -2646,8 +2716,8 @@ The visual design expert can be used for all things visual design, from quick qu
2646
2716
  - Layout and composition ideas that go beyond generic AI defaults
2647
2717
  - Analyzing a reference site or screenshot for design insights (it can take screenshots and do research on its own)
2648
2718
  - Beautiful layout images or photos
2649
- - Icon recommendations
2650
- - Proposing full visual directions during intake
2719
+ - Icon recommendations or AI image editing
2720
+ - Proposing full visual design and layout directions during intake
2651
2721
 
2652
2722
  **How to write the task:**
2653
2723
  Include context about the app \u2014 what it does, who uses it, what mood or feeling the interface should convey. If the user has any specific requirements, be sure to include them. The agent can not see your conversation with the user, so you need to include all details. More context produces better results. For quick questions ("three font pairings for a <x> app"), brief is fine. You can ask for multiple topics, multiple options, etc.
@@ -3202,6 +3272,8 @@ async function runTurn(params) {
3202
3272
  });
3203
3273
  }
3204
3274
  state.messages.push(userMsg);
3275
+ let lastCompletedTools = "";
3276
+ let lastCompletedResult = "";
3205
3277
  while (true) {
3206
3278
  let getOrCreateAccumulator2 = function(id, name) {
3207
3279
  let acc = toolInputAccumulators.get(id);
@@ -3288,7 +3360,8 @@ async function runTurn(params) {
3288
3360
  apiConfig,
3289
3361
  getContext: () => ({
3290
3362
  assistantText: assistantText.slice(-500),
3291
- lastToolName: toolCalls.at(-1)?.name
3363
+ lastToolName: toolCalls.at(-1)?.name || lastCompletedTools || void 0,
3364
+ lastToolResult: lastCompletedResult || void 0
3292
3365
  }),
3293
3366
  onStatus: (label) => onEvent({ type: "status", message: label }),
3294
3367
  signal
@@ -3417,15 +3490,6 @@ async function runTurn(params) {
3417
3490
  count: toolCalls.length,
3418
3491
  tools: toolCalls.map((tc) => tc.name)
3419
3492
  });
3420
- const toolStatusWatcher = startStatusWatcher({
3421
- apiConfig,
3422
- getContext: () => ({
3423
- assistantText: assistantText.slice(-500),
3424
- lastToolName: toolCalls.map((tc) => tc.name).join(", ")
3425
- }),
3426
- onStatus: (label) => onEvent({ type: "status", message: label }),
3427
- signal
3428
- });
3429
3493
  const results = await Promise.all(
3430
3494
  toolCalls.map(async (tc) => {
3431
3495
  if (signal?.aborted) {
@@ -3483,7 +3547,8 @@ async function runTurn(params) {
3483
3547
  }
3484
3548
  })
3485
3549
  );
3486
- toolStatusWatcher.stop();
3550
+ lastCompletedTools = toolCalls.map((tc) => tc.name).join(", ");
3551
+ lastCompletedResult = results.at(-1)?.result ?? "";
3487
3552
  for (const r of results) {
3488
3553
  state.messages.push({
3489
3554
  role: "user",
@@ -3817,7 +3882,7 @@ function resolveConfig(flags2) {
3817
3882
  const activeEnv = file.environment || "prod";
3818
3883
  const env = file.environments?.[activeEnv];
3819
3884
  const apiKey = flags2?.apiKey || process.env.MINDSTUDIO_API_KEY || env?.apiKey || "";
3820
- const baseUrl = flags2?.baseUrl || process.env.MINDSTUDIO_BASE_URL || env?.apiBaseUrl || DEFAULT_BASE_URL;
3885
+ const baseUrl2 = flags2?.baseUrl || process.env.MINDSTUDIO_BASE_URL || env?.apiBaseUrl || DEFAULT_BASE_URL;
3821
3886
  if (!apiKey) {
3822
3887
  log.error("No API key found");
3823
3888
  throw new Error(
@@ -3826,11 +3891,11 @@ function resolveConfig(flags2) {
3826
3891
  }
3827
3892
  const keySource = flags2?.apiKey ? "cli flag" : process.env.MINDSTUDIO_API_KEY ? "env var" : "config file";
3828
3893
  log.info("Config resolved", {
3829
- baseUrl,
3894
+ baseUrl: baseUrl2,
3830
3895
  keySource,
3831
3896
  environment: activeEnv
3832
3897
  });
3833
- return { apiKey, baseUrl };
3898
+ return { apiKey, baseUrl: baseUrl2 };
3834
3899
  }
3835
3900
  var CONFIG_PATH, DEFAULT_BASE_URL;
3836
3901
  var init_config = __esm({
@@ -22,9 +22,19 @@ Start from these four and extend as needed. Add interface specs for other interf
22
22
 
23
23
  Users often care about look and feel as much as (or more than) underlying data structures. Don't treat the brand and interface specs as an afterthought — for many users, the visual identity and voice are the first things they want to get right.
24
24
 
25
- Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, and implementation hints belong in annotations, not in the prose.
25
+ Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, CSS properties, code snippets, and implementation hints belong in annotations, not in the prose.
26
26
 
27
- When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax (`![description](url)`). The spec should be a visual document — if there's a hero image, a background photo, or a generated graphic, include it inline so the user can see it and the coding agent can reference it during build.
27
+ When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax. Write descriptive alt text that captures what the image actually depicts (this helps accessibility and helps the coding agent understand the image without loading it). Use the surrounding prose to explain the design intent — what the image is for, how it should be used in the layout, and why it was chosen.
28
+
29
+ ```markdown
30
+ ### Hero Section
31
+
32
+ The hero uses a full-bleed editorial photograph. The image should be used as
33
+ a background with the headline overlaid where there's negative space.
34
+
35
+ ![Editorial portrait, warm golden hour lighting, person looking out over a
36
+ city skyline, shallow depth of field, shot on 85mm](https://i.mscdn.ai/...)
37
+ ```
28
38
 
29
39
  **Refining with the user:**
30
40
  After writing the first draft, guide the user through it. Don't just ask "does this look good?" — the user is seeing a multi-section spec for the first time.
@@ -22,9 +22,19 @@ Start from these four and extend as needed. Add interface specs for other interf
22
22
 
23
23
  Users often care about look and feel as much as (or more than) underlying data structures. Don't treat the brand and interface specs as an afterthought — for many users, the visual identity and voice are the first things they want to get right.
24
24
 
25
- Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, and implementation hints belong in annotations, not in the prose.
25
+ Write specs in natural, human language. Describe what the app does the way you'd explain it to a colleague. The spec rendered with annotations hidden is a human-forward document that anyone can read. The spec with annotations visible is the agent-forward document that drives code generation. Keep the prose clean and readable — technical details like column types, status values, CSS properties, code snippets, and implementation hints belong in annotations, not in the prose.
26
26
 
27
- When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax (`![description](url)`). The spec should be a visual document — if there's a hero image, a background photo, or a generated graphic, include it inline so the user can see it and the coding agent can reference it during build.
27
+ When you have image URLs (from the design expert, stock photos, or AI generation), embed them directly in the spec using markdown image syntax. Write descriptive alt text that captures what the image actually depicts (this helps accessibility and helps the coding agent understand the image without loading it). Use the surrounding prose to explain the design intent — what the image is for, how it should be used in the layout, and why it was chosen.
28
+
29
+ ```markdown
30
+ ### Hero Section
31
+
32
+ The hero uses a full-bleed editorial photograph. The image should be used as
33
+ a background with the headline overlaid where there's negative space.
34
+
35
+ ![Editorial portrait, warm golden hour lighting, person looking out over a
36
+ city skyline, shallow depth of field, shot on 85mm](https://i.mscdn.ai/...)
37
+ ```
28
38
 
29
39
  **Refining with the user:**
30
40
  After writing the first draft, guide the user through it. Don't just ask "does this look good?" — the user is seeing a multi-section spec for the first time.
@@ -95,6 +95,7 @@ Check a count with evaluate:
95
95
  - evaluate auto-returns simple expressions. `"script": "document.title"` works directly. For multi-statement scripts, use explicit return.
96
96
  - The snapshot in the response is always the most current page state. Even if a wait times out, check the snapshot field; the content you were waiting for may have appeared by then.
97
97
  - Execution stops on first error. If step 2 of 5 fails, steps 3-5 don't run. The response will contain results for steps 0-2 (with step 2 having an error field) plus the current snapshot. Adjust and retry from the failed step.
98
+ - Always call `resetBrowser` as your final action after all tests are complete. This restores the preview to a clean state for the user.
98
99
  </rules>
99
100
 
100
101
  <voice>
@@ -6,37 +6,64 @@ Not every interface needs images. A productivity dashboard, a finance tool, or a
6
6
 
7
7
  Do not provide images as "references" - images must be ready-to-use assets that can be included directly in the design.
8
8
 
9
- ### Two sources
9
+ ### Three tools
10
10
 
11
11
  **AI-generated photos and images** (`generateImages`) — Seedream produces high-quality results for both photorealistic images and abstract/creative visuals. You have full control over the output: style, composition, colors, mood. When generating multiple images, batch them in a single `generateImages` call — they run in parallel. Generated images are production assets, not mockups or concepts — they are hosted on MindStudio CDN at full resolution and will be used directly in the final interface.
12
12
 
13
- **Stock photography** (`searchStockPhotos`) — Pexels has modern, editorial-style photos. Useful for quick placeholders, mockups, or when you need a specific real-world subject (a specific city, a recognizable object, etc.). Write specific queries: "person writing in notebook at minimalist desk, natural light" not "office."
13
+ **Image editing** (`editImage`) — takes an existing image URL and a text instruction describing what to change. Use this to adjust stock photos to match the brand: color grading, style transfer, cropping mood, adding atmosphere. Find a great stock photo, then edit it to align with the design direction.
14
+
15
+ **Stock photography** (`searchStockPhotos`) — Pexels has modern, editorial-style photos. Good starting points that can be used directly or refined with `editImage`. Write specific queries: "person writing in notebook at minimalist desk, natural light" not "office."
14
16
 
15
17
  ### Writing good generation prompts
16
18
 
17
- Lead with the visual style, then describe the content. This order helps the model establish the look before filling in details.
19
+ Write prompts as natural sentences describing a scene, not as comma-separated keyword lists. Describe what a camera would see, not art direction instructions.
20
+
21
+ **Structure:** Subject and action first, then setting, then style and technical details. Include the intended use when relevant.
22
+
23
+ - "A woman laughing while reading on a sun-drenched balcony overlooking a Mediterranean harbor. Editorial photography, shot on Kodak Portra 400, 85mm lens at f/2, soft golden hour light, shallow depth of field. For a lifestyle app hero section."
24
+ - "An overhead view of a cluttered designer's desk with fabric swatches, sketches, and a coffee cup. Natural window light from the left, slightly desaturated tones, Canon 5D with 35mm lens. For an about page."
25
+ - "Smooth organic shapes in deep navy and warm amber, flowing liquid forms with subtle grain texture. Abstract digital art, high contrast, editorial feel."
26
+
27
+ **Photography vocabulary produces the best results.** The model responds strongly to specific references:
28
+ - Film stocks: Kodak Portra, Fuji Superia, Cinestill 800T, expired film
29
+ - Lenses: 85mm f/1.4, 35mm wide angle, 50mm Summilux, macro
30
+ - Lighting: golden hour, chiaroscuro, tungsten warmth, soft diffused studio light, direct flash
31
+ - Shot types: close-up, overhead flat lay, low angle, eye-level candid, aerial
32
+ - Techniques: shallow depth of field, halation around highlights, film grain, motion blur
18
33
 
19
- **Structure:** Style/medium first, then subject, then details.
20
- - "Digital photography, soft natural window light, shallow depth of field. A ceramic coffee cup on a marble countertop, morning light casting long shadows, warm tones."
21
- - "Flat vector illustration, clean lines, limited color palette. An isometric view of a workspace with a laptop, plant, and notebook."
22
- - "Abstract digital art, fluid gradients, high contrast. Deep navy flowing into warm amber, organic liquid shapes, editorial feel."
34
+ **Declare the medium early.** Saying "editorial photograph" vs "watercolor painting" vs "3D render" doesn't just change style — it changes the model's entire approach to composition, color, and detail. Set this expectation in the first sentence.
23
35
 
24
- **For photorealistic images:** Specify the photography style (editorial, portrait, product, aerial), lighting (natural, studio, golden hour, direct flash), and camera characteristics (close-up, wide angle, shallow depth of field, slightly grainy texture).
36
+ **For text in images**, wrap the exact text in double quotes and specify the style: `A neon sign reading "OPEN" in cursive pink lettering against a dark brick wall.`
37
+
38
+ **Compose for the layout.** If you know the image will have text overlaid, request space for it: "negative space in the upper left for headline text" or "clean sky area above the subject." If it's a background, consider "centered subject with clean margins." The first few words of the prompt carry the most weight — lead with the medium and subject.
25
39
 
26
40
  **Avoid:**
27
41
  - Hex codes in prompts — the model renders them as visible text. Describe colors by name instead.
28
- - Describing positions of arms, legs, or specific limb arrangements this confuses image models.
42
+ - Keyword lists separated by commas — write sentences.
43
+ - Describing positions of arms, legs, or specific limb arrangements.
44
+ - Conflicting style instructions ("photorealistic cartoon").
45
+ - Describing what you don't want — say "empty street" not "street with no cars."
46
+ - UI component language — "glass morphism effect", "card design", "button with hover state". Write prompts as if briefing a photographer or artist, not describing CSS.
47
+ - Generating text that should be HTML. Headlines, body copy, CTAs, and any text the user needs to read or interact with belongs in the markup, not baked into an image. Text *within a scene* is fine — a neon sign, a logo on a t-shirt, text on a billboard in a cityscape, an app screen in a device mockup. That's part of the visual content.
48
+
49
+ ### How generated images work in the UI
50
+
51
+ Every generated image is a full rectangular frame — a photograph, a poster, a painting, a texture. The image generator does not produce isolated elements, transparent PNGs, or UI components. The coding agent controls how images are used: cropping, blending, overlaying, masking with CSS.
52
+
53
+ This means you can generate a dramatic texture and the coding agent uses it as a card background with a blend mode. You can generate an editorial photo and the coding agent overlays text on it for a hero section. Think of yourself as providing visual ingredients, not finished UI.
29
54
 
30
55
  ### What makes good photos and images
31
56
 
32
- Think about what would actually appear on this page if a real design team made it. Photos and images should have real subjects that connect to the product's story — people, places, objects, scenes. You can make things that are truly beautiful. Generic abstract visuals are the AI image equivalent of purple gradients: safe, meaningless, forgettable. Push for images with specificity, strong subjects, and emotional resonance.
57
+ It's 2026. Everything is lifestyle and editorial. Even a landing page for a productivity tool or a SaaS product should feel like a magazine spread, not a tech blog. The era of sterile stock-photo-of-a-laptop-on-a-desk is over. People respond to beautiful, dramatic, emotionally resonant imagery.
58
+
59
+ Default to photography with real subjects — people, scenes, moments, environments. Use editorial and fashion photography vocabulary in your prompts. When abstract art is the right call (textures, editorial collages, gradient art), make it bold and intentional, not generic gradient blobs.
60
+
61
+ The coding agent should never need to source its own imagery. Always provide URLs.
33
62
 
34
63
  ### When to use images
35
64
 
36
65
  Include image recommendations in your designs when the product calls for it. A landing page without photography feels like a wireframe. A feature section with a real image feels finished. When proposing layouts, specify where images go and what they should depict — don't leave it to the coding agent to figure out.
37
66
 
38
- The coding agent should never need to source its own imagery. Always provide URLs.
39
-
40
67
  ### CDN image transforms
41
68
 
42
69
  Generated images and uploaded images are hosted on `i.mscdn.ai`. Use query string parameters to request appropriately sized images rather than CSS-scaling full-resolution originals:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mindstudio-ai/remy",
3
- "version": "0.1.12",
3
+ "version": "0.1.14",
4
4
  "description": "MindStudio coding agent",
5
5
  "repository": {
6
6
  "type": "git",