@mindstudio-ai/remy 0.1.191 → 0.1.193

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/headless.js CHANGED
@@ -2846,10 +2846,14 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
2846
2846
  let onLog;
2847
2847
  let model;
2848
2848
  let path12;
2849
+ let fullPage = true;
2849
2850
  if (typeof promptOrOptions === "object" && promptOrOptions !== null) {
2850
2851
  prompt = promptOrOptions.prompt;
2851
2852
  existingUrl = promptOrOptions.imageUrl;
2852
2853
  path12 = promptOrOptions.path;
2854
+ if (promptOrOptions.fullPage !== void 0) {
2855
+ fullPage = promptOrOptions.fullPage;
2856
+ }
2853
2857
  onLog = promptOrOptions.onLog;
2854
2858
  model = promptOrOptions.model;
2855
2859
  } else {
@@ -2861,9 +2865,9 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
2861
2865
  url = existingUrl;
2862
2866
  } else {
2863
2867
  const ssResult = await sidecarRequest(
2864
- "/screenshot-full-page",
2868
+ fullPage ? "/screenshot-full-page" : "/screenshot-viewport",
2865
2869
  path12 ? { path: path12 } : void 0,
2866
- { timeout: 12e4 }
2870
+ { timeout: fullPage ? 12e4 : 3e4 }
2867
2871
  );
2868
2872
  url = ssResult?.url || ssResult?.screenshotUrl;
2869
2873
  if (!url) {
@@ -3773,6 +3777,20 @@ var BROWSER_TOOLS = [
3773
3777
  }
3774
3778
  }
3775
3779
  }
3780
+ },
3781
+ {
3782
+ clearable: true,
3783
+ name: "screenshotViewport",
3784
+ description: "Capture a screenshot of just the visible viewport (no full-page scroll/stitch). Returns a CDN URL with full text analysis and description. Use this when the goal is a specific section the page is currently scrolled to, rather than the whole page.",
3785
+ inputSchema: {
3786
+ type: "object",
3787
+ properties: {
3788
+ path: {
3789
+ type: "string",
3790
+ description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
3791
+ }
3792
+ }
3793
+ }
3776
3794
  }
3777
3795
  ];
3778
3796
  var BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand"]);
@@ -3892,7 +3910,7 @@ function resolveModel(surfaceId, models, fallback) {
3892
3910
 
3893
3911
  // src/subagents/browserAutomation/index.ts
3894
3912
  var log7 = createLogger("browser-automation");
3895
- async function runBrowserAutomation(task, context) {
3913
+ async function runBrowserAutomation(task, context, opts) {
3896
3914
  const release = await acquireBrowserLock();
3897
3915
  try {
3898
3916
  const result = await runSubAgent({
@@ -3916,10 +3934,11 @@ async function runBrowserAutomation(task, context) {
3916
3934
  return `Error setting up browser: ${err.message}`;
3917
3935
  }
3918
3936
  }
3919
- if (name === "screenshotFullPage") {
3937
+ if (name === "screenshotFullPage" || name === "screenshotViewport") {
3920
3938
  try {
3921
3939
  return await captureAndAnalyzeScreenshot({
3922
3940
  path: _input.path,
3941
+ fullPage: name === "screenshotFullPage",
3923
3942
  onLog,
3924
3943
  model: resolveModel(
3925
3944
  "imageAnalysis",
@@ -3995,13 +4014,15 @@ async function runBrowserAutomation(task, context) {
3995
4014
  return result2;
3996
4015
  },
3997
4016
  toolRegistry: context.toolRegistry,
3998
- captureArtifacts: ["screenshotFullPage"]
4017
+ captureArtifacts: ["screenshotFullPage", "screenshotViewport"]
3999
4018
  });
4000
4019
  context.subAgentMessages?.set(context.toolCallId, result.messages);
4001
- const ss = result.artifacts?.screenshotFullPage;
4020
+ const viewport = result.artifacts?.screenshotViewport;
4021
+ const fullPage = result.artifacts?.screenshotFullPage;
4022
+ const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
4002
4023
  return {
4003
4024
  text: result.text,
4004
- ...ss?.url ? { screenshot: { url: ss.url, styleMap: ss.styleMap } } : {}
4025
+ ...preferred?.url ? { screenshot: { url: preferred.url, styleMap: preferred.styleMap } } : {}
4005
4026
  };
4006
4027
  } finally {
4007
4028
  release();
@@ -4042,10 +4063,14 @@ var screenshotTool = {
4042
4063
  clearable: true,
4043
4064
  definition: {
4044
4065
  name: "screenshot",
4045
- description: "Capture a full-height screenshot of the app preview and get a description of what's on screen. Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal), use the instructions param to describe the steps.",
4066
+ description: "Capture a screenshot of the app preview and get a description of what's on screen. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 for a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 for overall composition or content past the fold). Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal, scrolling to a section), use the instructions param to describe the steps.",
4046
4067
  inputSchema: {
4047
4068
  type: "object",
4048
4069
  properties: {
4070
+ fullPage: {
4071
+ type: "boolean",
4072
+ description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
4073
+ },
4049
4074
  prompt: {
4050
4075
  type: "string",
4051
4076
  description: "Optional question about the screenshot. If omitted, returns a general description of what's visible."
@@ -4060,12 +4085,15 @@ var screenshotTool = {
4060
4085
  },
4061
4086
  instructions: {
4062
4087
  type: "string",
4063
- description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
4088
+ description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, scrolling to a section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route, scroll to a section. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
4064
4089
  }
4065
- }
4090
+ },
4091
+ required: ["fullPage"]
4066
4092
  }
4067
4093
  },
4068
4094
  async execute(input, context) {
4095
+ const fullPage = input.fullPage === true;
4096
+ const shotKind = fullPage ? "full-page" : "viewport";
4069
4097
  try {
4070
4098
  if (input.imageUrl) {
4071
4099
  return await captureAndAnalyzeScreenshot({
@@ -4076,8 +4104,10 @@ var screenshotTool = {
4076
4104
  });
4077
4105
  }
4078
4106
  if (input.instructions && context) {
4079
- const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4080
- const result = await runBrowserAutomation(task, context);
4107
+ const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
4108
+ const result = await runBrowserAutomation(task, context, {
4109
+ capture: fullPage ? "fullPage" : "viewport"
4110
+ });
4081
4111
  if (!result.screenshot) {
4082
4112
  return result.text;
4083
4113
  }
@@ -4094,6 +4124,7 @@ var screenshotTool = {
4094
4124
  return await captureAndAnalyzeScreenshot({
4095
4125
  prompt: input.prompt,
4096
4126
  path: input.path,
4127
+ fullPage,
4097
4128
  onLog: context?.onLog,
4098
4129
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4099
4130
  });
@@ -4393,10 +4424,14 @@ __export(screenshot_exports, {
4393
4424
  var definition5 = {
4394
4425
  clearable: true,
4395
4426
  name: "screenshot",
4396
- description: "Capture a full-height screenshot of the current app preview. Returns a CDN URL along with visual analysis. Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
4427
+ description: "Capture a screenshot of the current app preview and get it back with visual analysis. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 use it to review a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 use it to review overall composition or a layout you can't see in one screen). Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
4397
4428
  inputSchema: {
4398
4429
  type: "object",
4399
4430
  properties: {
4431
+ fullPage: {
4432
+ type: "boolean",
4433
+ description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
4434
+ },
4400
4435
  prompt: {
4401
4436
  type: "string",
4402
4437
  description: "Optional specific question about the screenshot. Use a bulleted list to ask many questions at once."
@@ -4407,16 +4442,21 @@ var definition5 = {
4407
4442
  },
4408
4443
  instructions: {
4409
4444
  type: "string",
4410
- description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Only use instructions when you need to trigger stateful changes. Never describe what names or values to use when applying the isntructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
4445
+ description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, scrolling to a specific section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
4411
4446
  }
4412
- }
4447
+ },
4448
+ required: ["fullPage"]
4413
4449
  }
4414
4450
  };
4415
4451
  async function execute5(input, onLog, context) {
4452
+ const fullPage = input.fullPage === true;
4453
+ const shotKind = fullPage ? "full-page" : "viewport";
4416
4454
  if (input.instructions && context) {
4417
4455
  try {
4418
- const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4419
- const result = await runBrowserAutomation(task, context);
4456
+ const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
4457
+ const result = await runBrowserAutomation(task, context, {
4458
+ capture: fullPage ? "fullPage" : "viewport"
4459
+ });
4420
4460
  if (!result.screenshot) {
4421
4461
  return result.text;
4422
4462
  }
@@ -4436,6 +4476,7 @@ async function execute5(input, onLog, context) {
4436
4476
  return await captureAndAnalyzeScreenshot({
4437
4477
  prompt: input.prompt,
4438
4478
  path: input.path,
4479
+ fullPage,
4439
4480
  onLog,
4440
4481
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4441
4482
  });
package/dist/index.js CHANGED
@@ -3247,10 +3247,14 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
3247
3247
  let onLog;
3248
3248
  let model;
3249
3249
  let path13;
3250
+ let fullPage = true;
3250
3251
  if (typeof promptOrOptions === "object" && promptOrOptions !== null) {
3251
3252
  prompt = promptOrOptions.prompt;
3252
3253
  existingUrl = promptOrOptions.imageUrl;
3253
3254
  path13 = promptOrOptions.path;
3255
+ if (promptOrOptions.fullPage !== void 0) {
3256
+ fullPage = promptOrOptions.fullPage;
3257
+ }
3254
3258
  onLog = promptOrOptions.onLog;
3255
3259
  model = promptOrOptions.model;
3256
3260
  } else {
@@ -3262,9 +3266,9 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
3262
3266
  url = existingUrl;
3263
3267
  } else {
3264
3268
  const ssResult = await sidecarRequest(
3265
- "/screenshot-full-page",
3269
+ fullPage ? "/screenshot-full-page" : "/screenshot-viewport",
3266
3270
  path13 ? { path: path13 } : void 0,
3267
- { timeout: 12e4 }
3271
+ { timeout: fullPage ? 12e4 : 3e4 }
3268
3272
  );
3269
3273
  url = ssResult?.url || ssResult?.screenshotUrl;
3270
3274
  if (!url) {
@@ -4231,6 +4235,20 @@ var init_tools = __esm({
4231
4235
  }
4232
4236
  }
4233
4237
  }
4238
+ },
4239
+ {
4240
+ clearable: true,
4241
+ name: "screenshotViewport",
4242
+ description: "Capture a screenshot of just the visible viewport (no full-page scroll/stitch). Returns a CDN URL with full text analysis and description. Use this when the goal is a specific section the page is currently scrolled to, rather than the whole page.",
4243
+ inputSchema: {
4244
+ type: "object",
4245
+ properties: {
4246
+ path: {
4247
+ type: "string",
4248
+ description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
4249
+ }
4250
+ }
4251
+ }
4234
4252
  }
4235
4253
  ];
4236
4254
  BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand"]);
@@ -4263,7 +4281,7 @@ var init_prompt2 = __esm({
4263
4281
  });
4264
4282
 
4265
4283
  // src/subagents/browserAutomation/index.ts
4266
- async function runBrowserAutomation(task, context) {
4284
+ async function runBrowserAutomation(task, context, opts) {
4267
4285
  const release = await acquireBrowserLock();
4268
4286
  try {
4269
4287
  const result = await runSubAgent({
@@ -4287,10 +4305,11 @@ async function runBrowserAutomation(task, context) {
4287
4305
  return `Error setting up browser: ${err.message}`;
4288
4306
  }
4289
4307
  }
4290
- if (name === "screenshotFullPage") {
4308
+ if (name === "screenshotFullPage" || name === "screenshotViewport") {
4291
4309
  try {
4292
4310
  return await captureAndAnalyzeScreenshot({
4293
4311
  path: _input.path,
4312
+ fullPage: name === "screenshotFullPage",
4294
4313
  onLog,
4295
4314
  model: resolveModel(
4296
4315
  "imageAnalysis",
@@ -4366,13 +4385,15 @@ async function runBrowserAutomation(task, context) {
4366
4385
  return result2;
4367
4386
  },
4368
4387
  toolRegistry: context.toolRegistry,
4369
- captureArtifacts: ["screenshotFullPage"]
4388
+ captureArtifacts: ["screenshotFullPage", "screenshotViewport"]
4370
4389
  });
4371
4390
  context.subAgentMessages?.set(context.toolCallId, result.messages);
4372
- const ss = result.artifacts?.screenshotFullPage;
4391
+ const viewport = result.artifacts?.screenshotViewport;
4392
+ const fullPage = result.artifacts?.screenshotFullPage;
4393
+ const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
4373
4394
  return {
4374
4395
  text: result.text,
4375
- ...ss?.url ? { screenshot: { url: ss.url, styleMap: ss.styleMap } } : {}
4396
+ ...preferred?.url ? { screenshot: { url: preferred.url, styleMap: preferred.styleMap } } : {}
4376
4397
  };
4377
4398
  } finally {
4378
4399
  release();
@@ -4437,10 +4458,14 @@ var init_screenshot2 = __esm({
4437
4458
  clearable: true,
4438
4459
  definition: {
4439
4460
  name: "screenshot",
4440
- description: "Capture a full-height screenshot of the app preview and get a description of what's on screen. Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal), use the instructions param to describe the steps.",
4461
+ description: "Capture a screenshot of the app preview and get a description of what's on screen. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 for a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 for overall composition or content past the fold). Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal, scrolling to a section), use the instructions param to describe the steps.",
4441
4462
  inputSchema: {
4442
4463
  type: "object",
4443
4464
  properties: {
4465
+ fullPage: {
4466
+ type: "boolean",
4467
+ description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
4468
+ },
4444
4469
  prompt: {
4445
4470
  type: "string",
4446
4471
  description: "Optional question about the screenshot. If omitted, returns a general description of what's visible."
@@ -4455,12 +4480,15 @@ var init_screenshot2 = __esm({
4455
4480
  },
4456
4481
  instructions: {
4457
4482
  type: "string",
4458
- description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
4483
+ description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, scrolling to a section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route, scroll to a section. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
4459
4484
  }
4460
- }
4485
+ },
4486
+ required: ["fullPage"]
4461
4487
  }
4462
4488
  },
4463
4489
  async execute(input, context) {
4490
+ const fullPage = input.fullPage === true;
4491
+ const shotKind = fullPage ? "full-page" : "viewport";
4464
4492
  try {
4465
4493
  if (input.imageUrl) {
4466
4494
  return await captureAndAnalyzeScreenshot({
@@ -4471,8 +4499,10 @@ var init_screenshot2 = __esm({
4471
4499
  });
4472
4500
  }
4473
4501
  if (input.instructions && context) {
4474
- const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4475
- const result = await runBrowserAutomation(task, context);
4502
+ const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
4503
+ const result = await runBrowserAutomation(task, context, {
4504
+ capture: fullPage ? "fullPage" : "viewport"
4505
+ });
4476
4506
  if (!result.screenshot) {
4477
4507
  return result.text;
4478
4508
  }
@@ -4489,6 +4519,7 @@ var init_screenshot2 = __esm({
4489
4519
  return await captureAndAnalyzeScreenshot({
4490
4520
  prompt: input.prompt,
4491
4521
  path: input.path,
4522
+ fullPage,
4492
4523
  onLog: context?.onLog,
4493
4524
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4494
4525
  });
@@ -4826,10 +4857,14 @@ __export(screenshot_exports, {
4826
4857
  execute: () => execute5
4827
4858
  });
4828
4859
  async function execute5(input, onLog, context) {
4860
+ const fullPage = input.fullPage === true;
4861
+ const shotKind = fullPage ? "full-page" : "viewport";
4829
4862
  if (input.instructions && context) {
4830
4863
  try {
4831
- const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4832
- const result = await runBrowserAutomation(task, context);
4864
+ const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
4865
+ const result = await runBrowserAutomation(task, context, {
4866
+ capture: fullPage ? "fullPage" : "viewport"
4867
+ });
4833
4868
  if (!result.screenshot) {
4834
4869
  return result.text;
4835
4870
  }
@@ -4849,6 +4884,7 @@ async function execute5(input, onLog, context) {
4849
4884
  return await captureAndAnalyzeScreenshot({
4850
4885
  prompt: input.prompt,
4851
4886
  path: input.path,
4887
+ fullPage,
4852
4888
  onLog,
4853
4889
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4854
4890
  });
@@ -4869,10 +4905,14 @@ var init_screenshot3 = __esm({
4869
4905
  definition5 = {
4870
4906
  clearable: true,
4871
4907
  name: "screenshot",
4872
- description: "Capture a full-height screenshot of the current app preview. Returns a CDN URL along with visual analysis. Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
4908
+ description: "Capture a screenshot of the current app preview and get it back with visual analysis. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 use it to review a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 use it to review overall composition or a layout you can't see in one screen). Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
4873
4909
  inputSchema: {
4874
4910
  type: "object",
4875
4911
  properties: {
4912
+ fullPage: {
4913
+ type: "boolean",
4914
+ description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
4915
+ },
4876
4916
  prompt: {
4877
4917
  type: "string",
4878
4918
  description: "Optional specific question about the screenshot. Use a bulleted list to ask many questions at once."
@@ -4883,9 +4923,10 @@ var init_screenshot3 = __esm({
4883
4923
  },
4884
4924
  instructions: {
4885
4925
  type: "string",
4886
- description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions before capturing the screenshot - it can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. You will always get back a full-height screenshot of the entire page. Do not attempt to scroll or capture specific areas. Only use instructions when you need to trigger stateful changes. Never describe what names or values to use when applying the isntructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
4926
+ description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, scrolling to a specific section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
4887
4927
  }
4888
- }
4928
+ },
4929
+ required: ["fullPage"]
4889
4930
  }
4890
4931
  };
4891
4932
  }
@@ -9,12 +9,12 @@
9
9
  ### Verification
10
10
  Run `lspDiagnostics` after every turn where you have edited code in any meaningful way. You don't need to run it for things like changing copy or CSS colors, but you should run it after any structural changes to code. It catches syntax errors, broken imports, and type mismatches instantly. After a big build or significant changes, also do a lightweight runtime check to catch the things static analysis misses (schema mismatches, missing imports, bad queries). Your runtime check can include:
11
11
  - Spot-checking methods with `runMethod`. The dev database is a disposable snapshot that will have been seeded with scenario data, so don't worry about being destructive.
12
- - For frontend work, taking a `screenshot` to confirm the main view renders correctly or look at the browser log for any console errors in the user's preview.
12
+ - For frontend work, checking the browser log for any console errors in the user's preview and, when a change's visual outcome is genuinely uncertain, taking a `screenshot` to confirm the main view renders correctly.
13
13
  - Using `runAutomatedBrowserTest` to verify an interactive flow that you can't confirm from a screenshot, when the user reports something broken that you can't identify from code alone, or whenever the verification involves driving the app through multiple interactions.
14
14
 
15
15
  Aim for confidence that the core happy paths work. If the 80% case is solid, the remaining edge cases are likely fine and the user can surface them in chat. Don't screenshot every page, test every permutation, or verify every secondary flow. One or two runtime checks that confirm the app loads and data flows through is enough.
16
16
 
17
- When making mechanical edits as part of iterating with the user (e.g., moving elements, changing labels, small redesigns and refactors), don't screenshot to confirm, simply trust your code. Re-screenshot only when changes are structural enough that the visual outcome is genuinely uncertain (new layout, new component composition, new route), or when the user reports something visible that you can't see in the code. The screenshot tool captures static/settled state - don't try to hack it with different instructions to capture transient states or animations or things like that. If what you need is not avaialble via screenshot, fall back to static analysis by tracing code.
17
+ Default to trusting your code. The test is whether you can predict the rendered result from the diff: for copy changes, color and spacing tweaks, swapping classes, and most style edits, you can — the diff already tells you the outcome, so don't screenshot. Reach for a `screenshot` only when the visual result is genuinely uncertain and you can't trace it from the code (a new layout, a new component composition, a new route), or when the user reports something visible that you can't see in the code. And when you're iterating live with the user on a page they're previewing, the user is your viewport — make the edit and let them react, rather than confirming what they can already see. The screenshot tool captures static/settled state - don't try to hack it with different instructions to capture transient states or animations or things like that. If what you need is not available via screenshot, fall back to static analysis by tracing code.
18
18
 
19
19
  ### Process Logs
20
20
  Process logs are available at .logs/ in NDJSON format (one JSON object per line) for debugging. Each line has at minimum ts (unix millis) and msg fields, plus structured context like level, module, requestId, toolCallId where available. You can use `jq` to examine logs and debug failures. Tools like run method or run scenario execute synchronously, so log data will be available by the time those tools return their results to you, there is no need to `sleep` before querying logfiles.
@@ -139,8 +139,8 @@ Check a count with evaluate:
139
139
  ```
140
140
  </examples>
141
141
 
142
- ### Full Page Screenshot
143
- You can use the `screenshotFullPage` tool to take a full-height screenshot of the current page. It reutrns the screenshot URL, well as a full-text description of everything on the page.
142
+ ### Final Screenshot
143
+ You can use the `screenshotFullPage` tool to take a full-height screenshot of the current page, or the `screenshotViewport` tool to capture just the visible viewport (faster, and the right choice when the task is about a specific section you've scrolled to). Both return the screenshot URL plus a full-text description. If the task asked for a viewport/section view, end with `screenshotViewport`; if it asked for the whole page, end with `screenshotFullPage`.
144
144
 
145
145
  <rules>
146
146
  - Always batch steps into a single browserCommand call. Don't send one step per turn. Type + click + wait should be one call, not three separate turns.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mindstudio-ai/remy",
3
- "version": "0.1.191",
3
+ "version": "0.1.193",
4
4
  "description": "MindStudio coding agent",
5
5
  "repository": {
6
6
  "type": "git",