@mindstudio-ai/remy 0.1.179 → 0.1.180

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/dist/headless.js +130 -141
  2. package/dist/index.js +130 -141
  3. package/package.json +1 -1
package/dist/headless.js CHANGED
@@ -3813,6 +3813,121 @@ function resolveModel(surfaceId, models, fallback) {
3813
3813
 
3814
3814
  // src/subagents/browserAutomation/index.ts
3815
3815
  var log6 = createLogger("browser-automation");
3816
/**
 * Runs the browser-automation sub-agent on a single task while holding the
 * browser lock, and returns a structured result.
 *
 * The lock obtained from `acquireBrowserLock()` is released in a `finally`
 * block, so it is given back whether the sub-agent resolves or rejects.
 *
 * @param {string} task - Instruction text forwarded to the sub-agent as `task`.
 * @param {object} context - Execution context. This function reads
 *   `apiConfig`, `models`, `model`, `signal`, `toolCallId`, `requestId`,
 *   `onEvent`, `toolRegistry`, and the optional `resolveExternalTool` and
 *   `subAgentMessages` members.
 * @returns {Promise<{text: string, screenshot?: {url: string, styleMap: *}}>}
 *   The sub-agent's text, plus `screenshot` only when the captured
 *   `screenshotFullPage` artifact carries a URL.
 */
async function runBrowserAutomation(task, context) {
  // Thrown values are not guaranteed to be Error instances; fall back to their
  // string form so the tool output never reads "undefined" and the catch
  // handlers themselves cannot throw on a null/undefined rejection.
  const describeError = (err) => err?.message ?? String(err);

  const resolveVisionModel = () => resolveModel("imageAnalysis", context.models, context.model);

  const isViewportScreenshot = (step) => step.command === "screenshotViewport" && step.result?.url;

  // Handles the tools the sub-agent runs locally. Failures are reported back
  // to the sub-agent as text instead of rejecting.
  const executeLocalTool = async (name, toolInput, _toolCallId, onLog) => {
    if (name === "setupBrowser") {
      try {
        const setupResult = await sidecarRequest(
          "/setup-browser",
          { auth: toolInput.auth, path: toolInput.path },
          { timeout: 15e3 }
        );
        return JSON.stringify(setupResult);
      } catch (err) {
        return `Error setting up browser: ${describeError(err)}`;
      }
    }
    if (name === "screenshotFullPage") {
      try {
        return await captureAndAnalyzeScreenshot({
          path: toolInput.path,
          onLog,
          model: resolveVisionModel()
        });
      } catch (err) {
        return `Error taking screenshot: ${describeError(err)}`;
      }
    }
    return `Error: unknown local tool "${name}"`;
  };

  // Parses a browserCommand result and, for every viewport screenshot step
  // that has a URL, requests an image analysis in one batch and stores it on
  // `step.result.analysis`. Returns the input untouched when there is nothing
  // to analyze. Throws when the result is not the expected JSON or the batch
  // call fails; the caller treats that as "no enrichment".
  const attachScreenshotAnalyses = async (rawResult) => {
    const parsed = JSON.parse(rawResult);
    const screenshotSteps = (parsed.steps || []).filter(isViewportScreenshot);
    if (screenshotSteps.length === 0) {
      return rawResult;
    }
    const visionOverride = { model: resolveVisionModel() };
    const batchInput = screenshotSteps.map((step) => ({
      stepType: "analyzeImage",
      step: {
        imageUrl: step.result.url,
        prompt: buildScreenshotAnalysisPrompt({ styleMap: step.result.styleMap }),
        visionModelOverride: visionOverride
      }
    }));
    const batchResult = await runMindstudioCli(
      ["batch", JSON.stringify(batchInput)],
      { timeout: 2e5, caller: "browserAutomation" }
    );
    try {
      const analyses = JSON.parse(batchResult);
      // Analyses are matched to screenshot steps by position; steps beyond the
      // number of returned analyses are left without an `analysis` field.
      screenshotSteps.forEach((step, index) => {
        if (index < analyses.length) {
          step.result.analysis = analyses[index]?.output?.analysis || analyses[index]?.output || "";
        }
      });
    } catch (err) {
      log6.debug("Failed to parse batch analysis result", {
        batchResult,
        error: describeError(err)
      });
    }
    return JSON.stringify(parsed);
  };

  // Delegates to the context's external-tool resolver and enriches
  // browserCommand results on a best-effort basis.
  const resolveBrowserExternalTool = async (id, name, input) => {
    if (!context.resolveExternalTool) {
      return "Error: no external tool resolver";
    }
    const rawResult = await context.resolveExternalTool(id, name, input);
    if (name !== "browserCommand") {
      return rawResult;
    }
    try {
      return await attachScreenshotAnalyses(rawResult);
    } catch (err) {
      // Enrichment is optional: keep returning the raw tool result, but leave
      // a trace instead of discarding the failure silently.
      log6.debug("Failed to attach screenshot analyses to browserCommand result", {
        error: describeError(err)
      });
      return rawResult;
    }
  };

  const release = await acquireBrowserLock();
  try {
    const result = await runSubAgent({
      system: getBrowserAutomationPrompt(),
      task,
      tools: BROWSER_TOOLS,
      externalTools: BROWSER_EXTERNAL_TOOLS,
      executeTool: executeLocalTool,
      apiConfig: context.apiConfig,
      model: resolveModel("browserAutomation", context.models, context.model),
      subAgentId: "browserAutomation",
      signal: context.signal,
      parentToolId: context.toolCallId,
      requestId: context.requestId,
      onEvent: context.onEvent,
      resolveExternalTool: resolveBrowserExternalTool,
      toolRegistry: context.toolRegistry,
      captureArtifacts: ["screenshotFullPage"]
    });
    context.subAgentMessages?.set(context.toolCallId, result.messages);
    const fullPage = result.artifacts?.screenshotFullPage;
    return {
      text: result.text,
      ...(fullPage?.url ? { screenshot: { url: fullPage.url, styleMap: fullPage.styleMap } } : {})
    };
  } finally {
    release();
  }
}
3816
3931
  var browserAutomationTool = {
3817
3932
  clearable: true,
3818
3933
  definition: {
@@ -3833,121 +3948,13 @@ var browserAutomationTool = {
3833
3948
  if (!context) {
3834
3949
  return "Error: browser automation requires execution context (only available in headless mode)";
3835
3950
  }
3836
- const release = await acquireBrowserLock();
3837
- try {
3838
- const result = await runSubAgent({
3839
- system: getBrowserAutomationPrompt(),
3840
- task: input.task,
3841
- tools: BROWSER_TOOLS,
3842
- externalTools: BROWSER_EXTERNAL_TOOLS,
3843
- executeTool: async (name, _input, _toolCallId, onLog) => {
3844
- if (name === "setupBrowser") {
3845
- try {
3846
- const result2 = await sidecarRequest(
3847
- "/setup-browser",
3848
- {
3849
- auth: _input.auth,
3850
- path: _input.path
3851
- },
3852
- { timeout: 15e3 }
3853
- );
3854
- return JSON.stringify(result2);
3855
- } catch (err) {
3856
- return `Error setting up browser: ${err.message}`;
3857
- }
3858
- }
3859
- if (name === "screenshotFullPage") {
3860
- try {
3861
- return await captureAndAnalyzeScreenshot({
3862
- path: _input.path,
3863
- onLog,
3864
- model: resolveModel(
3865
- "imageAnalysis",
3866
- context.models,
3867
- context.model
3868
- )
3869
- });
3870
- } catch (err) {
3871
- return `Error taking screenshot: ${err.message}`;
3872
- }
3873
- }
3874
- return `Error: unknown local tool "${name}"`;
3875
- },
3876
- apiConfig: context.apiConfig,
3877
- model: resolveModel("browserAutomation", context.models, context.model),
3878
- subAgentId: "browserAutomation",
3879
- signal: context.signal,
3880
- parentToolId: context.toolCallId,
3881
- requestId: context.requestId,
3882
- onEvent: context.onEvent,
3883
- resolveExternalTool: async (id, name, input2) => {
3884
- if (!context.resolveExternalTool) {
3885
- return "Error: no external tool resolver";
3886
- }
3887
- const result2 = await context.resolveExternalTool(id, name, input2);
3888
- if (name === "browserCommand") {
3889
- try {
3890
- const parsed = JSON.parse(result2);
3891
- const screenshotSteps = (parsed.steps || []).filter(
3892
- (s) => s.command === "screenshotViewport" && s.result?.url
3893
- );
3894
- if (screenshotSteps.length > 0) {
3895
- const visionOverride = {
3896
- model: resolveModel(
3897
- "imageAnalysis",
3898
- context.models,
3899
- context.model
3900
- )
3901
- };
3902
- const batchInput = screenshotSteps.map((s) => ({
3903
- stepType: "analyzeImage",
3904
- step: {
3905
- imageUrl: s.result.url,
3906
- prompt: buildScreenshotAnalysisPrompt({
3907
- styleMap: s.result.styleMap
3908
- }),
3909
- visionModelOverride: visionOverride
3910
- }
3911
- }));
3912
- const batchResult = await runMindstudioCli(
3913
- ["batch", JSON.stringify(batchInput)],
3914
- { timeout: 2e5, caller: "browserAutomation" }
3915
- );
3916
- try {
3917
- const analyses = JSON.parse(batchResult);
3918
- let ai = 0;
3919
- for (const step of parsed.steps) {
3920
- if (step.command === "screenshotViewport" && step.result?.url && ai < analyses.length) {
3921
- step.result.analysis = analyses[ai]?.output?.analysis || analyses[ai]?.output || "";
3922
- ai++;
3923
- }
3924
- }
3925
- } catch {
3926
- log6.debug("Failed to parse batch analysis result", {
3927
- batchResult
3928
- });
3929
- }
3930
- return JSON.stringify(parsed);
3931
- }
3932
- } catch {
3933
- }
3934
- }
3935
- return result2;
3936
- },
3937
- toolRegistry: context.toolRegistry,
3938
- captureArtifacts: ["screenshotFullPage"]
3939
- });
3940
- context.subAgentMessages?.set(context.toolCallId, result.messages);
3941
- const ss = result.artifacts?.screenshotFullPage;
3942
- if (ss?.url) {
3943
- return `${result.text}
3951
+ const result = await runBrowserAutomation(input.task, context);
3952
+ if (result.screenshot) {
3953
+ return `${result.text}
3944
3954
 
3945
- ![Final state](${ss.url})`;
3946
- }
3947
- return result.text;
3948
- } finally {
3949
- release();
3955
+ ![Final state](${result.screenshot.url})`;
3950
3956
  }
3957
+ return result.text;
3951
3958
  }
3952
3959
  };
3953
3960
 
@@ -3991,23 +3998,14 @@ var screenshotTool = {
3991
3998
  }
3992
3999
  if (input.instructions && context) {
3993
4000
  const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
3994
- const result = await browserAutomationTool.execute({ task }, context);
3995
- const resultStr = result;
3996
- let url;
3997
- let styleMap;
3998
- try {
3999
- const parsed = JSON.parse(resultStr);
4000
- url = parsed.screenshotUrl;
4001
- styleMap = parsed.styleMap;
4002
- } catch {
4003
- }
4004
- if (!url) {
4005
- return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${resultStr}`;
4001
+ const result = await runBrowserAutomation(task, context);
4002
+ if (!result.screenshot) {
4003
+ return result.text;
4006
4004
  }
4007
4005
  return await streamScreenshotAnalysis({
4008
- url,
4006
+ url: result.screenshot.url,
4009
4007
  prompt: input.prompt,
4010
- styleMap,
4008
+ styleMap: result.screenshot.styleMap,
4011
4009
  onLog: context?.onLog,
4012
4010
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4013
4011
  });
@@ -4339,23 +4337,14 @@ async function execute5(input, onLog, context) {
4339
4337
  if (input.instructions && context) {
4340
4338
  try {
4341
4339
  const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4342
- const result = await browserAutomationTool.execute({ task }, context);
4343
- const resultStr = result;
4344
- let url;
4345
- let styleMap;
4346
- try {
4347
- const parsed = JSON.parse(resultStr);
4348
- url = parsed.screenshotUrl;
4349
- styleMap = parsed.styleMap;
4350
- } catch {
4351
- }
4352
- if (!url) {
4353
- return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${resultStr}`;
4340
+ const result = await runBrowserAutomation(task, context);
4341
+ if (!result.screenshot) {
4342
+ return result.text;
4354
4343
  }
4355
4344
  return await streamScreenshotAnalysis({
4356
- url,
4345
+ url: result.screenshot.url,
4357
4346
  prompt: input.prompt,
4358
- styleMap,
4347
+ styleMap: result.screenshot.styleMap,
4359
4348
  onLog,
4360
4349
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4361
4350
  });
package/dist/index.js CHANGED
@@ -4176,6 +4176,121 @@ var init_prompt2 = __esm({
4176
4176
  });
4177
4177
 
4178
4178
  // src/subagents/browserAutomation/index.ts
4179
/**
 * Runs the browser-automation sub-agent on a single task while holding the
 * browser lock, and returns a structured result.
 *
 * The lock obtained from `acquireBrowserLock()` is released in a `finally`
 * block, so it is given back whether the sub-agent resolves or rejects.
 *
 * @param {string} task - Instruction text forwarded to the sub-agent as `task`.
 * @param {object} context - Execution context. This function reads
 *   `apiConfig`, `models`, `model`, `signal`, `toolCallId`, `requestId`,
 *   `onEvent`, `toolRegistry`, and the optional `resolveExternalTool` and
 *   `subAgentMessages` members.
 * @returns {Promise<{text: string, screenshot?: {url: string, styleMap: *}}>}
 *   The sub-agent's text, plus `screenshot` only when the captured
 *   `screenshotFullPage` artifact carries a URL.
 */
async function runBrowserAutomation(task, context) {
  // Thrown values are not guaranteed to be Error instances; fall back to their
  // string form so the tool output never reads "undefined" and the catch
  // handlers themselves cannot throw on a null/undefined rejection.
  const describeError = (err) => err?.message ?? String(err);

  const resolveVisionModel = () => resolveModel("imageAnalysis", context.models, context.model);

  const isViewportScreenshot = (step) => step.command === "screenshotViewport" && step.result?.url;

  // Handles the tools the sub-agent runs locally. Failures are reported back
  // to the sub-agent as text instead of rejecting.
  const executeLocalTool = async (name, toolInput, _toolCallId, onLog) => {
    if (name === "setupBrowser") {
      try {
        const setupResult = await sidecarRequest(
          "/setup-browser",
          { auth: toolInput.auth, path: toolInput.path },
          { timeout: 15e3 }
        );
        return JSON.stringify(setupResult);
      } catch (err) {
        return `Error setting up browser: ${describeError(err)}`;
      }
    }
    if (name === "screenshotFullPage") {
      try {
        return await captureAndAnalyzeScreenshot({
          path: toolInput.path,
          onLog,
          model: resolveVisionModel()
        });
      } catch (err) {
        return `Error taking screenshot: ${describeError(err)}`;
      }
    }
    return `Error: unknown local tool "${name}"`;
  };

  // Parses a browserCommand result and, for every viewport screenshot step
  // that has a URL, requests an image analysis in one batch and stores it on
  // `step.result.analysis`. Returns the input untouched when there is nothing
  // to analyze. Throws when the result is not the expected JSON or the batch
  // call fails; the caller treats that as "no enrichment".
  const attachScreenshotAnalyses = async (rawResult) => {
    const parsed = JSON.parse(rawResult);
    const screenshotSteps = (parsed.steps || []).filter(isViewportScreenshot);
    if (screenshotSteps.length === 0) {
      return rawResult;
    }
    const visionOverride = { model: resolveVisionModel() };
    const batchInput = screenshotSteps.map((step) => ({
      stepType: "analyzeImage",
      step: {
        imageUrl: step.result.url,
        prompt: buildScreenshotAnalysisPrompt({ styleMap: step.result.styleMap }),
        visionModelOverride: visionOverride
      }
    }));
    const batchResult = await runMindstudioCli(
      ["batch", JSON.stringify(batchInput)],
      { timeout: 2e5, caller: "browserAutomation" }
    );
    try {
      const analyses = JSON.parse(batchResult);
      // Analyses are matched to screenshot steps by position; steps beyond the
      // number of returned analyses are left without an `analysis` field.
      screenshotSteps.forEach((step, index) => {
        if (index < analyses.length) {
          step.result.analysis = analyses[index]?.output?.analysis || analyses[index]?.output || "";
        }
      });
    } catch (err) {
      log6.debug("Failed to parse batch analysis result", {
        batchResult,
        error: describeError(err)
      });
    }
    return JSON.stringify(parsed);
  };

  // Delegates to the context's external-tool resolver and enriches
  // browserCommand results on a best-effort basis.
  const resolveBrowserExternalTool = async (id, name, input) => {
    if (!context.resolveExternalTool) {
      return "Error: no external tool resolver";
    }
    const rawResult = await context.resolveExternalTool(id, name, input);
    if (name !== "browserCommand") {
      return rawResult;
    }
    try {
      return await attachScreenshotAnalyses(rawResult);
    } catch (err) {
      // Enrichment is optional: keep returning the raw tool result, but leave
      // a trace instead of discarding the failure silently.
      log6.debug("Failed to attach screenshot analyses to browserCommand result", {
        error: describeError(err)
      });
      return rawResult;
    }
  };

  const release = await acquireBrowserLock();
  try {
    const result = await runSubAgent({
      system: getBrowserAutomationPrompt(),
      task,
      tools: BROWSER_TOOLS,
      externalTools: BROWSER_EXTERNAL_TOOLS,
      executeTool: executeLocalTool,
      apiConfig: context.apiConfig,
      model: resolveModel("browserAutomation", context.models, context.model),
      subAgentId: "browserAutomation",
      signal: context.signal,
      parentToolId: context.toolCallId,
      requestId: context.requestId,
      onEvent: context.onEvent,
      resolveExternalTool: resolveBrowserExternalTool,
      toolRegistry: context.toolRegistry,
      captureArtifacts: ["screenshotFullPage"]
    });
    context.subAgentMessages?.set(context.toolCallId, result.messages);
    const fullPage = result.artifacts?.screenshotFullPage;
    return {
      text: result.text,
      ...(fullPage?.url ? { screenshot: { url: fullPage.url, styleMap: fullPage.styleMap } } : {})
    };
  } finally {
    release();
  }
}
4179
4294
  var log6, browserAutomationTool;
4180
4295
  var init_browserAutomation = __esm({
4181
4296
  "src/subagents/browserAutomation/index.ts"() {
@@ -4210,121 +4325,13 @@ var init_browserAutomation = __esm({
4210
4325
  if (!context) {
4211
4326
  return "Error: browser automation requires execution context (only available in headless mode)";
4212
4327
  }
4213
- const release = await acquireBrowserLock();
4214
- try {
4215
- const result = await runSubAgent({
4216
- system: getBrowserAutomationPrompt(),
4217
- task: input.task,
4218
- tools: BROWSER_TOOLS,
4219
- externalTools: BROWSER_EXTERNAL_TOOLS,
4220
- executeTool: async (name, _input, _toolCallId, onLog) => {
4221
- if (name === "setupBrowser") {
4222
- try {
4223
- const result2 = await sidecarRequest(
4224
- "/setup-browser",
4225
- {
4226
- auth: _input.auth,
4227
- path: _input.path
4228
- },
4229
- { timeout: 15e3 }
4230
- );
4231
- return JSON.stringify(result2);
4232
- } catch (err) {
4233
- return `Error setting up browser: ${err.message}`;
4234
- }
4235
- }
4236
- if (name === "screenshotFullPage") {
4237
- try {
4238
- return await captureAndAnalyzeScreenshot({
4239
- path: _input.path,
4240
- onLog,
4241
- model: resolveModel(
4242
- "imageAnalysis",
4243
- context.models,
4244
- context.model
4245
- )
4246
- });
4247
- } catch (err) {
4248
- return `Error taking screenshot: ${err.message}`;
4249
- }
4250
- }
4251
- return `Error: unknown local tool "${name}"`;
4252
- },
4253
- apiConfig: context.apiConfig,
4254
- model: resolveModel("browserAutomation", context.models, context.model),
4255
- subAgentId: "browserAutomation",
4256
- signal: context.signal,
4257
- parentToolId: context.toolCallId,
4258
- requestId: context.requestId,
4259
- onEvent: context.onEvent,
4260
- resolveExternalTool: async (id, name, input2) => {
4261
- if (!context.resolveExternalTool) {
4262
- return "Error: no external tool resolver";
4263
- }
4264
- const result2 = await context.resolveExternalTool(id, name, input2);
4265
- if (name === "browserCommand") {
4266
- try {
4267
- const parsed = JSON.parse(result2);
4268
- const screenshotSteps = (parsed.steps || []).filter(
4269
- (s) => s.command === "screenshotViewport" && s.result?.url
4270
- );
4271
- if (screenshotSteps.length > 0) {
4272
- const visionOverride = {
4273
- model: resolveModel(
4274
- "imageAnalysis",
4275
- context.models,
4276
- context.model
4277
- )
4278
- };
4279
- const batchInput = screenshotSteps.map((s) => ({
4280
- stepType: "analyzeImage",
4281
- step: {
4282
- imageUrl: s.result.url,
4283
- prompt: buildScreenshotAnalysisPrompt({
4284
- styleMap: s.result.styleMap
4285
- }),
4286
- visionModelOverride: visionOverride
4287
- }
4288
- }));
4289
- const batchResult = await runMindstudioCli(
4290
- ["batch", JSON.stringify(batchInput)],
4291
- { timeout: 2e5, caller: "browserAutomation" }
4292
- );
4293
- try {
4294
- const analyses = JSON.parse(batchResult);
4295
- let ai = 0;
4296
- for (const step of parsed.steps) {
4297
- if (step.command === "screenshotViewport" && step.result?.url && ai < analyses.length) {
4298
- step.result.analysis = analyses[ai]?.output?.analysis || analyses[ai]?.output || "";
4299
- ai++;
4300
- }
4301
- }
4302
- } catch {
4303
- log6.debug("Failed to parse batch analysis result", {
4304
- batchResult
4305
- });
4306
- }
4307
- return JSON.stringify(parsed);
4308
- }
4309
- } catch {
4310
- }
4311
- }
4312
- return result2;
4313
- },
4314
- toolRegistry: context.toolRegistry,
4315
- captureArtifacts: ["screenshotFullPage"]
4316
- });
4317
- context.subAgentMessages?.set(context.toolCallId, result.messages);
4318
- const ss = result.artifacts?.screenshotFullPage;
4319
- if (ss?.url) {
4320
- return `${result.text}
4328
+ const result = await runBrowserAutomation(input.task, context);
4329
+ if (result.screenshot) {
4330
+ return `${result.text}
4321
4331
 
4322
- ![Final state](${ss.url})`;
4323
- }
4324
- return result.text;
4325
- } finally {
4326
- release();
4332
+ ![Final state](${result.screenshot.url})`;
4327
4333
  }
4334
+ return result.text;
4328
4335
  }
4329
4336
  };
4330
4337
  }
@@ -4378,23 +4385,14 @@ var init_screenshot2 = __esm({
4378
4385
  }
4379
4386
  if (input.instructions && context) {
4380
4387
  const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4381
- const result = await browserAutomationTool.execute({ task }, context);
4382
- const resultStr = result;
4383
- let url;
4384
- let styleMap;
4385
- try {
4386
- const parsed = JSON.parse(resultStr);
4387
- url = parsed.screenshotUrl;
4388
- styleMap = parsed.styleMap;
4389
- } catch {
4390
- }
4391
- if (!url) {
4392
- return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${resultStr}`;
4388
+ const result = await runBrowserAutomation(task, context);
4389
+ if (!result.screenshot) {
4390
+ return result.text;
4393
4391
  }
4394
4392
  return await streamScreenshotAnalysis({
4395
- url,
4393
+ url: result.screenshot.url,
4396
4394
  prompt: input.prompt,
4397
- styleMap,
4395
+ styleMap: result.screenshot.styleMap,
4398
4396
  onLog: context?.onLog,
4399
4397
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4400
4398
  });
@@ -4744,23 +4742,14 @@ async function execute5(input, onLog, context) {
4744
4742
  if (input.instructions && context) {
4745
4743
  try {
4746
4744
  const task = input.path ? `Navigate to "${input.path}", then: ${input.instructions}. After completing these steps, take a full-page screenshot.` : `${input.instructions}. After completing these steps, take a full-page screenshot.`;
4747
- const result = await browserAutomationTool.execute({ task }, context);
4748
- const resultStr = result;
4749
- let url;
4750
- let styleMap;
4751
- try {
4752
- const parsed = JSON.parse(resultStr);
4753
- url = parsed.screenshotUrl;
4754
- styleMap = parsed.styleMap;
4755
- } catch {
4756
- }
4757
- if (!url) {
4758
- return `Error: browser navigation completed but no screenshot URL was returned. Agent output: ${resultStr}`;
4745
+ const result = await runBrowserAutomation(task, context);
4746
+ if (!result.screenshot) {
4747
+ return result.text;
4759
4748
  }
4760
4749
  return await streamScreenshotAnalysis({
4761
- url,
4750
+ url: result.screenshot.url,
4762
4751
  prompt: input.prompt,
4763
- styleMap,
4752
+ styleMap: result.screenshot.styleMap,
4764
4753
  onLog,
4765
4754
  model: resolveModel("imageAnalysis", context?.models, context?.model)
4766
4755
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mindstudio-ai/remy",
3
- "version": "0.1.179",
3
+ "version": "0.1.180",
4
4
  "description": "MindStudio coding agent",
5
5
  "repository": {
6
6
  "type": "git",