@mindstudio-ai/remy 0.1.193 → 0.1.195
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/headless.js +20 -19
- package/dist/index.js +20 -19
- package/dist/subagents/browserAutomation/prompt.md +13 -2
- package/package.json +1 -1
package/dist/headless.js
CHANGED
|
@@ -3709,7 +3709,7 @@ var BROWSER_TOOLS = [
|
|
|
3709
3709
|
"screenshotFullPage",
|
|
3710
3710
|
"screenshotViewport"
|
|
3711
3711
|
],
|
|
3712
|
-
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
|
|
3712
|
+
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
|
|
3713
3713
|
},
|
|
3714
3714
|
ref: {
|
|
3715
3715
|
type: "string",
|
|
@@ -3755,6 +3755,14 @@ var BROWSER_TOOLS = [
|
|
|
3755
3755
|
type: "array",
|
|
3756
3756
|
items: { type: "string" },
|
|
3757
3757
|
description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
|
|
3758
|
+
},
|
|
3759
|
+
scrollToSelector: {
|
|
3760
|
+
type: "string",
|
|
3761
|
+
description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
|
|
3762
|
+
},
|
|
3763
|
+
scrollY: {
|
|
3764
|
+
type: "number",
|
|
3765
|
+
description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
|
|
3758
3766
|
}
|
|
3759
3767
|
},
|
|
3760
3768
|
required: ["command"]
|
|
@@ -3777,20 +3785,6 @@ var BROWSER_TOOLS = [
|
|
|
3777
3785
|
}
|
|
3778
3786
|
}
|
|
3779
3787
|
}
|
|
3780
|
-
},
|
|
3781
|
-
{
|
|
3782
|
-
clearable: true,
|
|
3783
|
-
name: "screenshotViewport",
|
|
3784
|
-
description: "Capture a screenshot of just the visible viewport (no full-page scroll/stitch). Returns a CDN URL with full text analysis and description. Use this when the goal is a specific section the page is currently scrolled to, rather than the whole page.",
|
|
3785
|
-
inputSchema: {
|
|
3786
|
-
type: "object",
|
|
3787
|
-
properties: {
|
|
3788
|
-
path: {
|
|
3789
|
-
type: "string",
|
|
3790
|
-
description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
|
|
3791
|
-
}
|
|
3792
|
-
}
|
|
3793
|
-
}
|
|
3794
3788
|
}
|
|
3795
3789
|
];
|
|
3796
3790
|
var BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand"]);
|
|
@@ -3894,6 +3888,7 @@ var ALLOWED_MODELS_BY_TYPE = {
|
|
|
3894
3888
|
"claude-4-7-opus",
|
|
3895
3889
|
"claude-4-6-opus",
|
|
3896
3890
|
"claude-4-6-sonnet",
|
|
3891
|
+
"claude-fable-5",
|
|
3897
3892
|
"gpt-5.5",
|
|
3898
3893
|
"gemini-3-pro",
|
|
3899
3894
|
"gemini-3.1-pro",
|
|
@@ -3913,6 +3908,7 @@ var log7 = createLogger("browser-automation");
|
|
|
3913
3908
|
async function runBrowserAutomation(task, context, opts) {
|
|
3914
3909
|
const release = await acquireBrowserLock();
|
|
3915
3910
|
try {
|
|
3911
|
+
let lastBrowserCommandViewport;
|
|
3916
3912
|
const result = await runSubAgent({
|
|
3917
3913
|
system: getBrowserAutomationPrompt(),
|
|
3918
3914
|
task,
|
|
@@ -3934,11 +3930,11 @@ async function runBrowserAutomation(task, context, opts) {
|
|
|
3934
3930
|
return `Error setting up browser: ${err.message}`;
|
|
3935
3931
|
}
|
|
3936
3932
|
}
|
|
3937
|
-
if (name === "screenshotFullPage"
|
|
3933
|
+
if (name === "screenshotFullPage") {
|
|
3938
3934
|
try {
|
|
3939
3935
|
return await captureAndAnalyzeScreenshot({
|
|
3940
3936
|
path: _input.path,
|
|
3941
|
-
fullPage:
|
|
3937
|
+
fullPage: true,
|
|
3942
3938
|
onLog,
|
|
3943
3939
|
model: resolveModel(
|
|
3944
3940
|
"imageAnalysis",
|
|
@@ -3971,6 +3967,11 @@ async function runBrowserAutomation(task, context, opts) {
|
|
|
3971
3967
|
(s) => s.command === "screenshotViewport" && s.result?.url
|
|
3972
3968
|
);
|
|
3973
3969
|
if (screenshotSteps.length > 0) {
|
|
3970
|
+
const lastStep = screenshotSteps[screenshotSteps.length - 1];
|
|
3971
|
+
lastBrowserCommandViewport = {
|
|
3972
|
+
url: lastStep.result.url,
|
|
3973
|
+
styleMap: lastStep.result.styleMap
|
|
3974
|
+
};
|
|
3974
3975
|
const visionOverride = {
|
|
3975
3976
|
model: resolveModel(
|
|
3976
3977
|
"imageAnalysis",
|
|
@@ -4014,11 +4015,11 @@ async function runBrowserAutomation(task, context, opts) {
|
|
|
4014
4015
|
return result2;
|
|
4015
4016
|
},
|
|
4016
4017
|
toolRegistry: context.toolRegistry,
|
|
4017
|
-
captureArtifacts: ["screenshotFullPage"
|
|
4018
|
+
captureArtifacts: ["screenshotFullPage"]
|
|
4018
4019
|
});
|
|
4019
4020
|
context.subAgentMessages?.set(context.toolCallId, result.messages);
|
|
4020
|
-
const viewport = result.artifacts?.screenshotViewport;
|
|
4021
4021
|
const fullPage = result.artifacts?.screenshotFullPage;
|
|
4022
|
+
const viewport = lastBrowserCommandViewport;
|
|
4022
4023
|
const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
|
|
4023
4024
|
return {
|
|
4024
4025
|
text: result.text,
|
package/dist/index.js
CHANGED
|
@@ -2112,6 +2112,7 @@ var init_surfaces = __esm({
|
|
|
2112
2112
|
"claude-4-7-opus",
|
|
2113
2113
|
"claude-4-6-opus",
|
|
2114
2114
|
"claude-4-6-sonnet",
|
|
2115
|
+
"claude-fable-5",
|
|
2115
2116
|
"gpt-5.5",
|
|
2116
2117
|
"gemini-3-pro",
|
|
2117
2118
|
"gemini-3.1-pro",
|
|
@@ -4167,7 +4168,7 @@ var init_tools = __esm({
|
|
|
4167
4168
|
"screenshotFullPage",
|
|
4168
4169
|
"screenshotViewport"
|
|
4169
4170
|
],
|
|
4170
|
-
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport."
|
|
4171
|
+
description: "snapshot: accessibility tree of the page (waits for network to settle). click: click an element (animated cursor, full event sequence). type: type text into input (one char at a time, works with React/Vue/Svelte). select: select a dropdown option by text. wait: wait for an element to appear (polls 100ms, waits for network). navigate: navigate to a URL within the app (waits for load, subsequent steps run on new page). evaluate: run JS in the page. styles: read computed CSS styles from elements (pass properties array with camelCase names, or omit for defaults). screenshotFullPage: full-page viewport-stitched screenshot (returns CDN url with dimensions). screenshotViewport: screenshot of just the visible viewport \u2014 pass `scrollToSelector` (or `scrollY`) on this step to scroll a section into view and capture it in one atomic step (no separate scroll needed)."
|
|
4171
4172
|
},
|
|
4172
4173
|
ref: {
|
|
4173
4174
|
type: "string",
|
|
@@ -4213,6 +4214,14 @@ var init_tools = __esm({
|
|
|
4213
4214
|
type: "array",
|
|
4214
4215
|
items: { type: "string" },
|
|
4215
4216
|
description: 'For styles: camelCase CSS property names to read (e.g., ["backgroundColor", "borderRadius", "fontSize"]). Omit for a default set.'
|
|
4217
|
+
},
|
|
4218
|
+
scrollToSelector: {
|
|
4219
|
+
type: "string",
|
|
4220
|
+
description: "For screenshotViewport: a CSS selector to scroll into view (via the capture\u2019s own context) immediately before the shot, so scroll + capture are atomic. Prefer this over a separate evaluate-scroll step when capturing a specific section."
|
|
4221
|
+
},
|
|
4222
|
+
scrollY: {
|
|
4223
|
+
type: "number",
|
|
4224
|
+
description: "For screenshotViewport: absolute Y offset to scroll to before the shot, when no selector is available."
|
|
4216
4225
|
}
|
|
4217
4226
|
},
|
|
4218
4227
|
required: ["command"]
|
|
@@ -4235,20 +4244,6 @@ var init_tools = __esm({
|
|
|
4235
4244
|
}
|
|
4236
4245
|
}
|
|
4237
4246
|
}
|
|
4238
|
-
},
|
|
4239
|
-
{
|
|
4240
|
-
clearable: true,
|
|
4241
|
-
name: "screenshotViewport",
|
|
4242
|
-
description: "Capture a screenshot of just the visible viewport (no full-page scroll/stitch). Returns a CDN URL with full text analysis and description. Use this when the goal is a specific section the page is currently scrolled to, rather than the whole page.",
|
|
4243
|
-
inputSchema: {
|
|
4244
|
-
type: "object",
|
|
4245
|
-
properties: {
|
|
4246
|
-
path: {
|
|
4247
|
-
type: "string",
|
|
4248
|
-
description: 'Navigate to this path before capturing (e.g. "/settings"). If omitted, screenshots the current page.'
|
|
4249
|
-
}
|
|
4250
|
-
}
|
|
4251
|
-
}
|
|
4252
4247
|
}
|
|
4253
4248
|
];
|
|
4254
4249
|
BROWSER_EXTERNAL_TOOLS = /* @__PURE__ */ new Set(["browserCommand"]);
|
|
@@ -4284,6 +4279,7 @@ var init_prompt2 = __esm({
|
|
|
4284
4279
|
async function runBrowserAutomation(task, context, opts) {
|
|
4285
4280
|
const release = await acquireBrowserLock();
|
|
4286
4281
|
try {
|
|
4282
|
+
let lastBrowserCommandViewport;
|
|
4287
4283
|
const result = await runSubAgent({
|
|
4288
4284
|
system: getBrowserAutomationPrompt(),
|
|
4289
4285
|
task,
|
|
@@ -4305,11 +4301,11 @@ async function runBrowserAutomation(task, context, opts) {
|
|
|
4305
4301
|
return `Error setting up browser: ${err.message}`;
|
|
4306
4302
|
}
|
|
4307
4303
|
}
|
|
4308
|
-
if (name === "screenshotFullPage"
|
|
4304
|
+
if (name === "screenshotFullPage") {
|
|
4309
4305
|
try {
|
|
4310
4306
|
return await captureAndAnalyzeScreenshot({
|
|
4311
4307
|
path: _input.path,
|
|
4312
|
-
fullPage:
|
|
4308
|
+
fullPage: true,
|
|
4313
4309
|
onLog,
|
|
4314
4310
|
model: resolveModel(
|
|
4315
4311
|
"imageAnalysis",
|
|
@@ -4342,6 +4338,11 @@ async function runBrowserAutomation(task, context, opts) {
|
|
|
4342
4338
|
(s) => s.command === "screenshotViewport" && s.result?.url
|
|
4343
4339
|
);
|
|
4344
4340
|
if (screenshotSteps.length > 0) {
|
|
4341
|
+
const lastStep = screenshotSteps[screenshotSteps.length - 1];
|
|
4342
|
+
lastBrowserCommandViewport = {
|
|
4343
|
+
url: lastStep.result.url,
|
|
4344
|
+
styleMap: lastStep.result.styleMap
|
|
4345
|
+
};
|
|
4345
4346
|
const visionOverride = {
|
|
4346
4347
|
model: resolveModel(
|
|
4347
4348
|
"imageAnalysis",
|
|
@@ -4385,11 +4386,11 @@ async function runBrowserAutomation(task, context, opts) {
|
|
|
4385
4386
|
return result2;
|
|
4386
4387
|
},
|
|
4387
4388
|
toolRegistry: context.toolRegistry,
|
|
4388
|
-
captureArtifacts: ["screenshotFullPage"
|
|
4389
|
+
captureArtifacts: ["screenshotFullPage"]
|
|
4389
4390
|
});
|
|
4390
4391
|
context.subAgentMessages?.set(context.toolCallId, result.messages);
|
|
4391
|
-
const viewport = result.artifacts?.screenshotViewport;
|
|
4392
4392
|
const fullPage = result.artifacts?.screenshotFullPage;
|
|
4393
|
+
const viewport = lastBrowserCommandViewport;
|
|
4393
4394
|
const preferred = opts?.capture === "viewport" ? viewport ?? fullPage : fullPage ?? viewport;
|
|
4394
4395
|
return {
|
|
4395
4396
|
text: result.text,
|
package/dist/subagents/browserAutomation/prompt.md
CHANGED
|
@@ -43,7 +43,7 @@ Note: the snapshot concatenates inline text and strips whitespace. If you need t
|
|
|
43
43
|
- `navigate`: Navigate to a new URL within the app. Waits for the new page to load before continuing with subsequent steps. Use this instead of evaluate with `window.location.href` when you need to navigate and then continue interacting with the new page. Steps after navigate execute on the new page automatically.
|
|
44
44
|
- `evaluate`: Run arbitrary JavaScript in the page and return the result.
|
|
45
45
|
- `styles`: Read computed CSS styles from page elements. Pass a `properties` array with camelCase CSS property names (e.g., `["backgroundColor", "borderRadius", "fontSize"]`). Omit `properties` for a default set covering colors, typography, spacing, borders, shadows, dimensions, and layout. Uses the same targeting as click/type (ref, text, role, label, selector). Omit the target to get styles for all elements from the last snapshot.
|
|
46
|
-
- `screenshotViewport`: Take a screenshot of the
|
|
46
|
+
- `screenshotViewport`: Take a screenshot of the visible viewport. Returns a CDN URL with full text analysis and dimensions. To capture a specific section, set `scrollToSelector` (a CSS selector) — or `scrollY` (an absolute offset) — on this same step; it scrolls the target into view and captures it atomically, so you do NOT need a separate scroll step. Do not use this if you can get what you need with other tools; use it only when you need to see the viewport visually.
|
|
47
47
|
|
|
48
48
|
### Element targeting (tried in order)
|
|
49
49
|
|
|
@@ -109,6 +109,15 @@ Select a dropdown option and screenshot the result:
|
|
|
109
109
|
}
|
|
110
110
|
```
|
|
111
111
|
|
|
112
|
+
Capture a specific below-the-fold section (scroll + capture in one atomic step):
|
|
113
|
+
```json
|
|
114
|
+
{
|
|
115
|
+
"steps": [
|
|
116
|
+
{ "command": "screenshotViewport", "scrollToSelector": "#pricing" }
|
|
117
|
+
]
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
112
121
|
Navigate to a sub-page and interact with it:
|
|
113
122
|
```json
|
|
114
123
|
{
|
|
@@ -140,7 +149,9 @@ Check a count with evaluate:
|
|
|
140
149
|
</examples>
|
|
141
150
|
|
|
142
151
|
### Final Screenshot
|
|
143
|
-
|
|
152
|
+
How you take the final screenshot depends on what the task asked for:
|
|
153
|
+
- **Whole page** → use the standalone `screenshotFullPage` tool. It takes a full-height screenshot of the current page and returns the URL plus a full-text description.
|
|
154
|
+
- **A specific section / viewport** → use a `browserCommand` batch ending in a `screenshotViewport` step with `scrollToSelector` set to the section (e.g. `{ "command": "screenshotViewport", "scrollToSelector": "#pricing" }`). This scrolls the section into view and captures it in one atomic step. Do this rather than a separate scroll step followed by a capture — capturing the viewport is only reliable when the scroll and the shot are in the same step.
|
|
144
155
|
|
|
145
156
|
<rules>
|
|
146
157
|
- Always batch steps into a single browserCommand call. Don't send one step per turn. Type + click + wait should be one call, not three separate turns.
|