@mindstudio-ai/remy 0.1.192 → 0.1.194
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/headless.js +57 -16
- package/dist/index.js +57 -16
- package/dist/subagents/browserAutomation/prompt.md +14 -3
- package/package.json +1 -1
package/dist/headless.js
CHANGED
|
@@ -2846,10 +2846,14 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
|
|
|
2846
2846
|
let onLog;
|
|
2847
2847
|
let model;
|
|
2848
2848
|
let path12;
|
|
2849
|
+
let fullPage = true;
|
|
2849
2850
|
if (typeof promptOrOptions === "object" && promptOrOptions !== null) {
|
|
2850
2851
|
prompt = promptOrOptions.prompt;
|
|
2851
2852
|
existingUrl = promptOrOptions.imageUrl;
|
|
2852
2853
|
path12 = promptOrOptions.path;
|
|
2854
|
+
if (promptOrOptions.fullPage !== void 0) {
|
|
2855
|
+
fullPage = promptOrOptions.fullPage;
|
|
2856
|
+
}
|
|
2853
2857
|
onLog = promptOrOptions.onLog;
|
|
2854
2858
|
model = promptOrOptions.model;
|
|
2855
2859
|
} else {
|
|
@@ -2861,9 +2865,9 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
|
|
|
2861
2865
|
url = existingUrl;
|
|
2862
2866
|
} else {
|
|
2863
2867
|
const ssResult = await sidecarRequest(
|
|
2864
|
-
"/screenshot-full-page",
|
|
2868
|
+
fullPage ? "/screenshot-full-page" : "/screenshot-viewport",
|
|
2865
2869
|
path12 ? { path: path12 } : void 0,
|
|
2866
|
-
{ timeout: 12e4 }
|
|
2870
|
+
{ timeout: fullPage ? 12e4 : 3e4 }
|
|
2867
2871
|
);
|
|
2868
2872
|
url = ssResult?.url || ssResult?.screenshotUrl;
|
|
2869
2873
|
if (!url) {
|
|
@@ -3705,7 +3709,7 @@ var BROWSER_TOOLS = [
|
|
|
3705
3709
|
"screenshotFullPage",
|
|
3706
3710
|
"screenshotViewport"
|
|
3707
3711
|
],
|
|
3708
|
-
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
|
|
3712
|
+
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
|
|
3709
3713
|
},
|
|
3710
3714
|
ref: {
|
|
3711
3715
|
type: "string",
|
|
@@ -3751,6 +3755,14 @@ var BROWSER_TOOLS = [
|
|
|
3751
3755
|
type: "array",
|
|
3752
3756
|
items: { type: "string" },
|
|
3753
3757
|
description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
|
|
3758
|
+
},
|
|
3759
|
+
scrollToSelector: {
|
|
3760
|
+
type: "string",
|
|
3761
|
+
description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
|
|
3762
|
+
},
|
|
3763
|
+
scrollY: {
|
|
3764
|
+
type: "number",
|
|
3765
|
+
description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
|
|
3754
3766
|
}
|
|
3755
3767
|
},
|
|
3756
3768
|
required: ["command"]
|
|
@@ -3892,9 +3904,10 @@ function resolveModel(surfaceId, models, fallback) {
|
|
|
3892
3904
|
|
|
3893
3905
|
// src/subagents/browserAutomation/index.ts
|
|
3894
3906
|
var log7 = createLogger("browser-automation");
|
|
3895
|
-
async function runBrowserAutomation(task, context) {
|
|
3907
|
+
async function runBrowserAutomation(task, context, opts) {
|
|
3896
3908
|
const release = await acquireBrowserLock();
|
|
3897
3909
|
try {
|
|
3910
|
+
let lastBrowserCommandViewport;
|
|
3898
3911
|
const result = await runSubAgent({
|
|
3899
3912
|
system: getBrowserAutomationPrompt(),
|
|
3900
3913
|
task,
|
|
@@ -3920,6 +3933,7 @@ async function runBrowserAutomation(task, context) {
|
|
|
3920
3933
|
try {
|
|
3921
3934
|
return await captureAndAnalyzeScreenshot({
|
|
3922
3935
|
path: _input.path,
|
|
3936
|
+
fullPage: true,
|
|
3923
3937
|
onLog,
|
|
3924
3938
|
model: resolveModel(
|
|
3925
3939
|
"imageAnalysis",
|
|
@@ -3952,6 +3966,11 @@ async function runBrowserAutomation(task, context) {
|
|
|
3952
3966
|
(s) => s.command === "screenshotViewport" && s.result?.url
|
|
3953
3967
|
);
|
|
3954
3968
|
if (screenshotSteps.length > 0) {
|
|
3969
|
+
const lastStep = screenshotSteps[screenshotSteps.length - 1];
|
|
3970
|
+
lastBrowserCommandViewport = {
|
|
3971
|
+
url: lastStep.result.url,
|
|
3972
|
+
styleMap: lastStep.result.styleMap
|
|
3973
|
+
};
|
|
3955
3974
|
const visionOverride = {
|
|
3956
3975
|
model: resolveModel(
|
|
3957
3976
|
"imageAnalysis",
|
|
@@ -3998,10 +4017,12 @@ async function runBrowserAutomation(task, context) {
|
|
|
3998
4017
|
captureArtifacts: ["screenshotFullPage"]
|
|
3999
4018
|
});
|
|
4000
4019
|
context.subAgentMessages?.set(context.toolCallId, result.messages);
|
|
4001
|
-
const
|
|
4020
|
+
const fullPage = result.artifacts?.screenshotFullPage;
|
|
4021
|
+
const viewport = lastBrowserCommandViewport;
|
|
4022
|
+
const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
|
|
4002
4023
|
return {
|
|
4003
4024
|
text: result.text,
|
|
4004
|
-
...
|
|
4025
|
+
...preferred?.url ? { screenshot: { url: preferred.url, styleMap: preferred.styleMap } } : {}
|
|
4005
4026
|
};
|
|
4006
4027
|
} finally {
|
|
4007
4028
|
release();
|
|
@@ -4042,10 +4063,14 @@ var screenshotTool = {
|
|
|
4042
4063
|
clearable: true,
|
|
4043
4064
|
definition: {
|
|
4044
4065
|
name: "screenshot",
|
|
4045
|
-
description: "Capture a
|
|
4066
|
+
description: "Capture a screenshot of the app preview and get a description of what's on screen. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 for a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 for overall composition or content past the fold). Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal, scrolling to a section), use the instructions param to describe the steps.",
|
|
4046
4067
|
inputSchema: {
|
|
4047
4068
|
type: "object",
|
|
4048
4069
|
properties: {
|
|
4070
|
+
fullPage: {
|
|
4071
|
+
type: "boolean",
|
|
4072
|
+
description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
|
|
4073
|
+
},
|
|
4049
4074
|
prompt: {
|
|
4050
4075
|
type: "string",
|
|
4051
4076
|
description: "Optional question about the screenshot. If omitted, returns a general description of what's visible."
|
|
@@ -4060,12 +4085,15 @@ var screenshotTool = {
|
|
|
4060
4085
|
},
|
|
4061
4086
|
instructions: {
|
|
4062
4087
|
type: "string",
|
|
4063
|
-
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions
|
|
4088
|
+
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, scrolling to a section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route, scroll to a section. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
|
|
4064
4089
|
}
|
|
4065
|
-
}
|
|
4090
|
+
},
|
|
4091
|
+
required: ["fullPage"]
|
|
4066
4092
|
}
|
|
4067
4093
|
},
|
|
4068
4094
|
async execute(input, context) {
|
|
4095
|
+
const fullPage = input.fullPage === true;
|
|
4096
|
+
const shotKind = fullPage ? "full-page" : "viewport";
|
|
4069
4097
|
try {
|
|
4070
4098
|
if (input.imageUrl) {
|
|
4071
4099
|
return await captureAndAnalyzeScreenshot({
|
|
@@ -4076,8 +4104,10 @@ var screenshotTool = {
|
|
|
4076
4104
|
});
|
|
4077
4105
|
}
|
|
4078
4106
|
if (input.instructions && context) {
|
|
4079
|
-
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a
|
|
4080
|
-
const result = await runBrowserAutomation(task, context
|
|
4107
|
+
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
|
|
4108
|
+
const result = await runBrowserAutomation(task, context, {
|
|
4109
|
+
capture: fullPage ? "fullPage" : "viewport"
|
|
4110
|
+
});
|
|
4081
4111
|
if (!result.screenshot) {
|
|
4082
4112
|
return result.text;
|
|
4083
4113
|
}
|
|
@@ -4094,6 +4124,7 @@ var screenshotTool = {
|
|
|
4094
4124
|
return await captureAndAnalyzeScreenshot({
|
|
4095
4125
|
prompt: input.prompt,
|
|
4096
4126
|
path: input.path,
|
|
4127
|
+
fullPage,
|
|
4097
4128
|
onLog: context?.onLog,
|
|
4098
4129
|
model: resolveModel("imageAnalysis", context?.models, context?.model)
|
|
4099
4130
|
});
|
|
@@ -4393,10 +4424,14 @@ __export(screenshot_exports, {
|
|
|
4393
4424
|
var definition5 = {
|
|
4394
4425
|
clearable: true,
|
|
4395
4426
|
name: "screenshot",
|
|
4396
|
-
description: "Capture a
|
|
4427
|
+
description: "Capture a screenshot of the current app preview and get it back with visual analysis. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 use it to review a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 use it to review overall composition or a layout you can't see in one screen). Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
|
|
4397
4428
|
inputSchema: {
|
|
4398
4429
|
type: "object",
|
|
4399
4430
|
properties: {
|
|
4431
|
+
fullPage: {
|
|
4432
|
+
type: "boolean",
|
|
4433
|
+
description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
|
|
4434
|
+
},
|
|
4400
4435
|
prompt: {
|
|
4401
4436
|
type: "string",
|
|
4402
4437
|
description: "Optional specific question about the screenshot. Use a bulleted list to ask many questions at once."
|
|
@@ -4407,16 +4442,21 @@ var definition5 = {
|
|
|
4407
4442
|
},
|
|
4408
4443
|
instructions: {
|
|
4409
4444
|
type: "string",
|
|
4410
|
-
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form,
|
|
4445
|
+
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, scrolling to a specific section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
|
|
4411
4446
|
}
|
|
4412
|
-
}
|
|
4447
|
+
},
|
|
4448
|
+
required: ["fullPage"]
|
|
4413
4449
|
}
|
|
4414
4450
|
};
|
|
4415
4451
|
async function execute5(input, onLog, context) {
|
|
4452
|
+
const fullPage = input.fullPage === true;
|
|
4453
|
+
const shotKind = fullPage ? "full-page" : "viewport";
|
|
4416
4454
|
if (input.instructions && context) {
|
|
4417
4455
|
try {
|
|
4418
|
-
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a
|
|
4419
|
-
const result = await runBrowserAutomation(task, context
|
|
4456
|
+
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
|
|
4457
|
+
const result = await runBrowserAutomation(task, context, {
|
|
4458
|
+
capture: fullPage ? "fullPage" : "viewport"
|
|
4459
|
+
});
|
|
4420
4460
|
if (!result.screenshot) {
|
|
4421
4461
|
return result.text;
|
|
4422
4462
|
}
|
|
@@ -4436,6 +4476,7 @@ async function execute5(input, onLog, context) {
|
|
|
4436
4476
|
return await captureAndAnalyzeScreenshot({
|
|
4437
4477
|
prompt: input.prompt,
|
|
4438
4478
|
path: input.path,
|
|
4479
|
+
fullPage,
|
|
4439
4480
|
onLog,
|
|
4440
4481
|
model: resolveModel("imageAnalysis", context?.models, context?.model)
|
|
4441
4482
|
});
|
package/dist/index.js
CHANGED
|
@@ -3247,10 +3247,14 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
|
|
|
3247
3247
|
let onLog;
|
|
3248
3248
|
let model;
|
|
3249
3249
|
let path13;
|
|
3250
|
+
let fullPage = true;
|
|
3250
3251
|
if (typeof promptOrOptions === "object" && promptOrOptions !== null) {
|
|
3251
3252
|
prompt = promptOrOptions.prompt;
|
|
3252
3253
|
existingUrl = promptOrOptions.imageUrl;
|
|
3253
3254
|
path13 = promptOrOptions.path;
|
|
3255
|
+
if (promptOrOptions.fullPage !== void 0) {
|
|
3256
|
+
fullPage = promptOrOptions.fullPage;
|
|
3257
|
+
}
|
|
3254
3258
|
onLog = promptOrOptions.onLog;
|
|
3255
3259
|
model = promptOrOptions.model;
|
|
3256
3260
|
} else {
|
|
@@ -3262,9 +3266,9 @@ async function captureAndAnalyzeScreenshot(promptOrOptions) {
|
|
|
3262
3266
|
url = existingUrl;
|
|
3263
3267
|
} else {
|
|
3264
3268
|
const ssResult = await sidecarRequest(
|
|
3265
|
-
"/screenshot-full-page",
|
|
3269
|
+
fullPage ? "/screenshot-full-page" : "/screenshot-viewport",
|
|
3266
3270
|
path13 ? { path: path13 } : void 0,
|
|
3267
|
-
{ timeout: 12e4 }
|
|
3271
|
+
{ timeout: fullPage ? 12e4 : 3e4 }
|
|
3268
3272
|
);
|
|
3269
3273
|
url = ssResult?.url || ssResult?.screenshotUrl;
|
|
3270
3274
|
if (!url) {
|
|
@@ -4163,7 +4167,7 @@ var init_tools = __esm({
|
|
|
4163
4167
|
"screenshotFullPage",
|
|
4164
4168
|
"screenshotViewport"
|
|
4165
4169
|
],
|
|
4166
|
-
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
|
|
4170
|
+
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
|
|
4167
4171
|
},
|
|
4168
4172
|
ref: {
|
|
4169
4173
|
type: "string",
|
|
@@ -4209,6 +4213,14 @@ var init_tools = __esm({
|
|
|
4209
4213
|
type: "array",
|
|
4210
4214
|
items: { type: "string" },
|
|
4211
4215
|
description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
|
|
4216
|
+
},
|
|
4217
|
+
scrollToSelector: {
|
|
4218
|
+
type: "string",
|
|
4219
|
+
description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
|
|
4220
|
+
},
|
|
4221
|
+
scrollY: {
|
|
4222
|
+
type: "number",
|
|
4223
|
+
description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
|
|
4212
4224
|
}
|
|
4213
4225
|
},
|
|
4214
4226
|
required: ["command"]
|
|
@@ -4263,9 +4275,10 @@ var init_prompt2 = __esm({
|
|
|
4263
4275
|
});
|
|
4264
4276
|
|
|
4265
4277
|
// src/subagents/browserAutomation/index.ts
|
|
4266
|
-
async function runBrowserAutomation(task, context) {
|
|
4278
|
+
async function runBrowserAutomation(task, context, opts) {
|
|
4267
4279
|
const release = await acquireBrowserLock();
|
|
4268
4280
|
try {
|
|
4281
|
+
let lastBrowserCommandViewport;
|
|
4269
4282
|
const result = await runSubAgent({
|
|
4270
4283
|
system: getBrowserAutomationPrompt(),
|
|
4271
4284
|
task,
|
|
@@ -4291,6 +4304,7 @@ async function runBrowserAutomation(task, context) {
|
|
|
4291
4304
|
try {
|
|
4292
4305
|
return await captureAndAnalyzeScreenshot({
|
|
4293
4306
|
path: _input.path,
|
|
4307
|
+
fullPage: true,
|
|
4294
4308
|
onLog,
|
|
4295
4309
|
model: resolveModel(
|
|
4296
4310
|
"imageAnalysis",
|
|
@@ -4323,6 +4337,11 @@ async function runBrowserAutomation(task, context) {
|
|
|
4323
4337
|
(s) => s.command === "screenshotViewport" && s.result?.url
|
|
4324
4338
|
);
|
|
4325
4339
|
if (screenshotSteps.length > 0) {
|
|
4340
|
+
const lastStep = screenshotSteps[screenshotSteps.length - 1];
|
|
4341
|
+
lastBrowserCommandViewport = {
|
|
4342
|
+
url: lastStep.result.url,
|
|
4343
|
+
styleMap: lastStep.result.styleMap
|
|
4344
|
+
};
|
|
4326
4345
|
const visionOverride = {
|
|
4327
4346
|
model: resolveModel(
|
|
4328
4347
|
"imageAnalysis",
|
|
@@ -4369,10 +4388,12 @@ async function runBrowserAutomation(task, context) {
|
|
|
4369
4388
|
captureArtifacts: ["screenshotFullPage"]
|
|
4370
4389
|
});
|
|
4371
4390
|
context.subAgentMessages?.set(context.toolCallId, result.messages);
|
|
4372
|
-
const
|
|
4391
|
+
const fullPage = result.artifacts?.screenshotFullPage;
|
|
4392
|
+
const viewport = lastBrowserCommandViewport;
|
|
4393
|
+
const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
|
|
4373
4394
|
return {
|
|
4374
4395
|
text: result.text,
|
|
4375
|
-
...
|
|
4396
|
+
...preferred?.url ? { screenshot: { url: preferred.url, styleMap: preferred.styleMap } } : {}
|
|
4376
4397
|
};
|
|
4377
4398
|
} finally {
|
|
4378
4399
|
release();
|
|
@@ -4437,10 +4458,14 @@ var init_screenshot2 = __esm({
|
|
|
4437
4458
|
clearable: true,
|
|
4438
4459
|
definition: {
|
|
4439
4460
|
name: "screenshot",
|
|
4440
|
-
description: "Capture a
|
|
4461
|
+
description: "Capture a screenshot of the app preview and get a description of what's on screen. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 for a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 for overall composition or content past the fold). Captures the settled page state \u2014 it cannot catch animations, transitions, or transient state. Optionally provide specific questions about what you're looking for. Use a bulleted list to ask many questions at once. To ask additional questions about a screenshot you have already captured, pass its URL as imageUrl to skip recapture. If the screenshot requires interaction first (logging in, clicking a tab, dismissing a modal, scrolling to a section), use the instructions param to describe the steps.",
|
|
4441
4462
|
inputSchema: {
|
|
4442
4463
|
type: "object",
|
|
4443
4464
|
properties: {
|
|
4465
|
+
fullPage: {
|
|
4466
|
+
type: "boolean",
|
|
4467
|
+
description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
|
|
4468
|
+
},
|
|
4444
4469
|
prompt: {
|
|
4445
4470
|
type: "string",
|
|
4446
4471
|
description: "Optional question about the screenshot. If omitted, returns a general description of what's visible."
|
|
@@ -4455,12 +4480,15 @@ var init_screenshot2 = __esm({
|
|
|
4455
4480
|
},
|
|
4456
4481
|
instructions: {
|
|
4457
4482
|
type: "string",
|
|
4458
|
-
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions
|
|
4483
|
+
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, navigating a flow, scrolling to a section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start its test at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing. Use only when interaction is required to *reach* the state you want to capture \u2014 log in, dismiss a modal, switch a tab, follow a route, scroll to a section. If your steps are exercising the app's functionality across multiple states (running flows, asserting behavior under interaction, multi-step QA), use `runAutomatedBrowserTest` instead."
|
|
4459
4484
|
}
|
|
4460
|
-
}
|
|
4485
|
+
},
|
|
4486
|
+
required: ["fullPage"]
|
|
4461
4487
|
}
|
|
4462
4488
|
},
|
|
4463
4489
|
async execute(input, context) {
|
|
4490
|
+
const fullPage = input.fullPage === true;
|
|
4491
|
+
const shotKind = fullPage ? "full-page" : "viewport";
|
|
4464
4492
|
try {
|
|
4465
4493
|
if (input.imageUrl) {
|
|
4466
4494
|
return await captureAndAnalyzeScreenshot({
|
|
@@ -4471,8 +4499,10 @@ var init_screenshot2 = __esm({
|
|
|
4471
4499
|
});
|
|
4472
4500
|
}
|
|
4473
4501
|
if (input.instructions && context) {
|
|
4474
|
-
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a
|
|
4475
|
-
const result = await runBrowserAutomation(task, context
|
|
4502
|
+
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
|
|
4503
|
+
const result = await runBrowserAutomation(task, context, {
|
|
4504
|
+
capture: fullPage ? "fullPage" : "viewport"
|
|
4505
|
+
});
|
|
4476
4506
|
if (!result.screenshot) {
|
|
4477
4507
|
return result.text;
|
|
4478
4508
|
}
|
|
@@ -4489,6 +4519,7 @@ var init_screenshot2 = __esm({
|
|
|
4489
4519
|
return await captureAndAnalyzeScreenshot({
|
|
4490
4520
|
prompt: input.prompt,
|
|
4491
4521
|
path: input.path,
|
|
4522
|
+
fullPage,
|
|
4492
4523
|
onLog: context?.onLog,
|
|
4493
4524
|
model: resolveModel("imageAnalysis", context?.models, context?.model)
|
|
4494
4525
|
});
|
|
@@ -4826,10 +4857,14 @@ __export(screenshot_exports, {
|
|
|
4826
4857
|
execute: () => execute5
|
|
4827
4858
|
});
|
|
4828
4859
|
async function execute5(input, onLog, context) {
|
|
4860
|
+
const fullPage = input.fullPage === true;
|
|
4861
|
+
const shotKind = fullPage ? "full-page" : "viewport";
|
|
4829
4862
|
if (input.instructions && context) {
|
|
4830
4863
|
try {
|
|
4831
|
-
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a
|
|
4832
|
-
const result = await runBrowserAutomation(task, context
|
|
4864
|
+
const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a ${shotKind} screenshot.` : `${input.instructions}. After completing these steps, take a ${shotKind} screenshot.`;
|
|
4865
|
+
const result = await runBrowserAutomation(task, context, {
|
|
4866
|
+
capture: fullPage ? "fullPage" : "viewport"
|
|
4867
|
+
});
|
|
4833
4868
|
if (!result.screenshot) {
|
|
4834
4869
|
return result.text;
|
|
4835
4870
|
}
|
|
@@ -4849,6 +4884,7 @@ async function execute5(input, onLog, context) {
|
|
|
4849
4884
|
return await captureAndAnalyzeScreenshot({
|
|
4850
4885
|
prompt: input.prompt,
|
|
4851
4886
|
path: input.path,
|
|
4887
|
+
fullPage,
|
|
4852
4888
|
onLog,
|
|
4853
4889
|
model: resolveModel("imageAnalysis", context?.models, context?.model)
|
|
4854
4890
|
});
|
|
@@ -4869,10 +4905,14 @@ var init_screenshot3 = __esm({
|
|
|
4869
4905
|
definition5 = {
|
|
4870
4906
|
clearable: true,
|
|
4871
4907
|
name: "screenshot",
|
|
4872
|
-
description: "Capture a
|
|
4908
|
+
description: "Capture a screenshot of the current app preview and get it back with visual analysis. Choose `fullPage`: `false` captures just the visible viewport (fast \u2014 use it to review a specific section the page is scrolled to), `true` captures the entire page top-to-bottom (slower \u2014 use it to review overall composition or a layout you can't see in one screen). Use to review the current state of the UI being built. Remember, the screenshot analysis is not overly precise - for example, it cannot reliably identify specific fonts by name \u2014 it can only describe what letterforms look like.",
|
|
4873
4909
|
inputSchema: {
|
|
4874
4910
|
type: "object",
|
|
4875
4911
|
properties: {
|
|
4912
|
+
fullPage: {
|
|
4913
|
+
type: "boolean",
|
|
4914
|
+
description: "true = full-height capture of the entire page; false = just the visible viewport. Pick based on whether you need the whole page or a specific section."
|
|
4915
|
+
},
|
|
4876
4916
|
prompt: {
|
|
4877
4917
|
type: "string",
|
|
4878
4918
|
description: "Optional specific question about the screenshot. Use a bulleted list to ask many questions at once."
|
|
@@ -4883,9 +4923,10 @@ var init_screenshot3 = __esm({
|
|
|
4883
4923
|
},
|
|
4884
4924
|
instructions: {
|
|
4885
4925
|
type: "string",
|
|
4886
|
-
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form,
|
|
4926
|
+
description: "If the screenshot you need requires interaction first (dismissing a modal, clicking a tab, filling out a form, scrolling to a specific section, getting through a login/auth checkpoint), describe the steps to get there. A browser automation agent will follow these instructions, then capture per your `fullPage` choice \u2014 so with `fullPage: false` you can scroll to a section and capture just that viewport. It can bypass auth and get right to where it needs to be if you tell it to authenticate as a test user and give it the path/screen to start at. Never describe what names or values to use when applying the instructions - the browser automation agent must use its own values for it to work properly. If a specific auth role is required to access the content, be sure to note that - it can automatically assume it for the purpose of testing."
|
|
4887
4927
|
}
|
|
4888
|
-
}
|
|
4928
|
+
},
|
|
4929
|
+
required: ["fullPage"]
|
|
4889
4930
|
}
|
|
4890
4931
|
};
|
|
4891
4932
|
}
|
|
@@ -43,7 +43,7 @@ Note: the snapshot concatenates inline text and strips whitespace. If you need t
|
|
|
43
43
|
- `navigate`: Navigate to a new URL within the app. Waits for the new page to load before continuing with subsequent steps. Use this instead of evaluate with `window.location.href` when you need to navigate and then continue interacting with the new page. Steps after navigate execute on the new page automatically.
|
|
44
44
|
- `evaluate`: Run arbitrary JavaScript in the page and return the result.
|
|
45
45
|
- `styles`: Read computed CSS styles from page elements. Pass a `properties` array with camelCase CSS property names (e.g., `["backgroundColor", "borderRadius", "fontSize"]`). Omit `properties` for a default set covering colors, typography, spacing, borders, shadows, dimensions, and layout. Uses the same targeting as click/type (ref, text, role, label, selector). Omit the target to get styles for all elements from the last snapshot.
|
|
46
|
-
- `screenshotViewport`: Take a screenshot of the
|
|
46
|
+
- `screenshotViewport`: Take a screenshot of the visible viewport. Returns a CDN URL with full text analysis and dimensions. To capture a specific section, set `scrollToSelector` (a CSS selector) — or `scrollY` (an absolute offset) — on this same step; it scrolls the target into view and captures it atomically, so you do NOT need a separate scroll step. Do not use this command if you can get what you need with other tools — only use it when you need to visually see the viewport.
|
|
47
47
|
|
|
48
48
|
### Element targeting (tried in order)
|
|
49
49
|
|
|
@@ -109,6 +109,15 @@ Select a dropdown option and screenshot the result:
|
|
|
109
109
|
}
|
|
110
110
|
```
|
|
111
111
|
|
|
112
|
+
Capture a specific below-the-fold section (scroll + capture in one atomic step):
|
|
113
|
+
```json
|
|
114
|
+
{
|
|
115
|
+
"steps": [
|
|
116
|
+
{ "command": "screenshotViewport", "scrollToSelector": "#pricing" }
|
|
117
|
+
]
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
112
121
|
Navigate to a sub-page and interact with it:
|
|
113
122
|
```json
|
|
114
123
|
{
|
|
@@ -139,8 +148,10 @@ Check a count with evaluate:
|
|
|
139
148
|
```
|
|
140
149
|
</examples>
|
|
141
150
|
|
|
142
|
-
###
|
|
143
|
-
|
|
151
|
+
### Final Screenshot
|
|
152
|
+
How you take the final screenshot depends on what the task asked for:
|
|
153
|
+
- **Whole page** → use the standalone `screenshotFullPage` tool. It takes a full-height screenshot of the current page and returns the URL plus a full-text description.
|
|
154
|
+
- **A specific section / viewport** → use a `browserCommand` batch ending in a `screenshotViewport` step with `scrollToSelector` set to the section (e.g. `{ "command": "screenshotViewport", "scrollToSelector": "#pricing" }`). This scrolls the section into view and captures it in one atomic step. Do this rather than a separate scroll step followed by a capture — capturing the viewport is only reliable when the scroll and the shot are in the same step.
|
|
144
155
|
|
|
145
156
|
<rules>
|
|
146
157
|
- Always batch steps into a single browserCommand call. Don't send one step per turn. Type + click + wait should be one call, not three separate turns.
|