@browserbasehq/orca 3.0.0-preview.4 → 3.0.0-preview.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/LICENSE +21 -0
  2. package/dist/index.js +113 -34
  3. package/package.json +13 -14
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Browserbase Inc.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/dist/index.js CHANGED
@@ -1642,13 +1642,10 @@ function decorateRoles(nodes, opts) {
1642
1642
  }
1643
1643
  function buildHierarchicalTree(nodes, opts) {
1644
1644
  return __async(this, null, function* () {
1645
- var _a2, _b, _c;
1645
+ var _a2;
1646
1646
  const nodeMap = /* @__PURE__ */ new Map();
1647
1647
  for (const n of nodes) {
1648
- const hasName = Boolean((_a2 = n.name) == null ? void 0 : _a2.trim());
1649
- const hasDescription = Boolean((_b = n.description) == null ? void 0 : _b.trim());
1650
- const hasChildren = !!(n.childIds && n.childIds.length);
1651
- const keep = hasName || hasDescription || hasChildren || !isStructural(n.role);
1648
+ const keep = !!(n.name && n.name.trim()) || !!(n.childIds && n.childIds.length) || !isStructural(n.role);
1652
1649
  if (!keep) continue;
1653
1650
  nodeMap.set(n.nodeId, __spreadValues({}, n));
1654
1651
  }
@@ -1656,7 +1653,7 @@ function buildHierarchicalTree(nodes, opts) {
1656
1653
  if (!n.parentId) continue;
1657
1654
  const parent = nodeMap.get(n.parentId);
1658
1655
  const cur = nodeMap.get(n.nodeId);
1659
- if (parent && cur) ((_c = parent.children) != null ? _c : parent.children = []).push(cur);
1656
+ if (parent && cur) ((_a2 = parent.children) != null ? _a2 : parent.children = []).push(cur);
1660
1657
  }
1661
1658
  const roots = nodes.filter((n) => !n.parentId && nodeMap.has(n.nodeId)).map((n) => nodeMap.get(n.nodeId));
1662
1659
  const cleaned = (yield Promise.all(roots.map(pruneStructuralSafe))).filter(
@@ -1665,17 +1662,15 @@ function buildHierarchicalTree(nodes, opts) {
1665
1662
  return { tree: cleaned };
1666
1663
  function pruneStructuralSafe(node) {
1667
1664
  return __async(this, null, function* () {
1668
- var _a3, _b2;
1665
+ var _a3;
1669
1666
  if (+node.nodeId < 0) return null;
1670
- const hasDescription = Boolean((_a3 = node.description) == null ? void 0 : _a3.trim());
1671
- const structuralRole = isStructural(node.role) && !hasDescription;
1672
- const children = (_b2 = node.children) != null ? _b2 : [];
1667
+ const children = (_a3 = node.children) != null ? _a3 : [];
1673
1668
  if (!children.length) {
1674
- return structuralRole ? null : node;
1669
+ return isStructural(node.role) ? null : node;
1675
1670
  }
1676
1671
  const cleanedKids = (yield Promise.all(children.map(pruneStructuralSafe))).filter(Boolean);
1677
1672
  const prunedStatic = removeRedundantStaticTextChildren(node, cleanedKids);
1678
- if (structuralRole) {
1673
+ if (isStructural(node.role)) {
1679
1674
  if (prunedStatic.length === 1) return prunedStatic[0];
1680
1675
  if (prunedStatic.length === 0) return null;
1681
1676
  }
@@ -6501,7 +6496,7 @@ var import_path5 = __toESM(require("path"));
6501
6496
  var import_process2 = __toESM(require("process"));
6502
6497
 
6503
6498
  // lib/version.ts
6504
- var STAGEHAND_VERSION = "3.0.0-preview.4";
6499
+ var STAGEHAND_VERSION = "3.0.0-preview.6";
6505
6500
 
6506
6501
  // lib/v3/types/public/sdkErrors.ts
6507
6502
  var StagehandError = class extends Error {
@@ -7896,7 +7891,7 @@ Return the element that matches the instruction if it exists. Otherwise, return
7896
7891
  }
7897
7892
  function buildActPrompt(action, supportedActions, variables) {
7898
7893
  let instruction = `Find the most relevant element to perform an action on given the following action: ${action}.
7899
- If the action implies choosing/selecting/clicking an option from a dropdown, ignore the 'General Instructions' section, and follow the 'Dropdown Specific Instructions' section carefully.
7894
+ IF AND ONLY IF the action EXPLICITLY includes the word 'dropdown' and implies choosing/selecting an option from a dropdown, ignore the 'General Instructions' section, and follow the 'Dropdown Specific Instructions' section carefully.
7900
7895
 
7901
7896
  General Instructions:
7902
7897
  Provide an action for this element such as ${supportedActions.join(", ")}. Remember that to users, buttons and links look the same in most cases.
@@ -8305,9 +8300,7 @@ function act(_0) {
8305
8300
  "the arguments to pass to the method. For example, for a click, the arguments are empty, but for a fill, the arguments are the value to fill in."
8306
8301
  )
8307
8302
  ),
8308
- twoStep: import_v32.z.boolean().describe(
8309
- "true if we will need to take another action after this. false otherwise"
8310
- )
8303
+ twoStep: import_v32.z.boolean()
8311
8304
  });
8312
8305
  const messages = [
8313
8306
  buildActSystemPrompt(userProvidedInstructions),
@@ -8437,7 +8430,6 @@ function normalizeRootXPath(input) {
8437
8430
  function performUnderstudyMethod(page, frame, method, rawXPath, args, domSettleTimeoutMs) {
8438
8431
  return __async(this, null, function* () {
8439
8432
  var _a2;
8440
- yield waitForDomNetworkQuiet(frame, domSettleTimeoutMs);
8441
8433
  const selectorRaw = normalizeRootXPath(rawXPath);
8442
8434
  const locator = yield resolveLocatorWithHops(
8443
8435
  page,
@@ -9056,6 +9048,7 @@ var ActHandler = class {
9056
9048
  const llmClient = this.resolveLlmClient(model);
9057
9049
  const doObserveAndAct = () => __async(this, null, function* () {
9058
9050
  var _a2, _b, _c, _d, _e, _f, _g, _h, _i, _j;
9051
+ yield waitForDomNetworkQuiet(page.mainFrame(), this.defaultDomSettleTimeoutMs);
9059
9052
  const snapshot = yield captureHybridSnapshot(page, {
9060
9053
  experimental: true
9061
9054
  });
@@ -9689,7 +9682,8 @@ var createActTool = (v3, executionModel) => (0, import_ai2.tool)({
9689
9682
  });
9690
9683
  return {
9691
9684
  success: (_b = result.success) != null ? _b : true,
9692
- action: (_c = result == null ? void 0 : result.actionDescription) != null ? _c : action
9685
+ action: (_c = result == null ? void 0 : result.actionDescription) != null ? _c : action,
9686
+ playwrightArguments: actions.length > 0 ? actions[0] : void 0
9693
9687
  };
9694
9688
  } catch (error) {
9695
9689
  return { success: false, error: (_d = error == null ? void 0 : error.message) != null ? _d : String(error) };
@@ -9867,7 +9861,11 @@ For any form with 2+ inputs/textareas. Faster than individual typing.`,
9867
9861
  observeResults,
9868
9862
  actions: replayableActions
9869
9863
  });
9870
- return { success: true, actions: completed };
9864
+ return {
9865
+ success: true,
9866
+ actions: completed,
9867
+ playwrightArguments: replayableActions
9868
+ };
9871
9869
  })
9872
9870
  });
9873
9871
 
@@ -10088,6 +10086,69 @@ function calculateCompressionStats(originalSize, compressedSize, screenshotCount
10088
10086
  };
10089
10087
  }
10090
10088
 
10089
+ // lib/v3/agent/utils/actionMapping.ts
// Converts one agent tool call into a list of recorded action objects.
// Takes a single options object with:
//   toolCallName - name of the tool that was invoked; selects the mapper below
//   toolResult   - value produced by the tool, forwarded to the act/fillForm mappers
//   args         - input arguments of the tool call
//   reasoning    - text attached to each produced action
// Always returns an array: the act and fillForm mappers build it themselves,
// every other tool name yields a one-element array from createStandardAction.
10090
+ function mapToolResultToActions({
10091
+ toolCallName,
10092
+ toolResult,
10093
+ args,
10094
+ reasoning
10095
+ }) {
10096
+ switch (toolCallName) {
10097
+ case "act":
10098
+ return mapActToolResult(toolResult, args, reasoning);
10099
+ case "fillForm":
10100
+ return mapFillFormToolResult(toolResult, args, reasoning);
// Any other tool ignores toolResult and is recorded from its name and args only.
10101
+ default:
10102
+ return [createStandardAction(toolCallName, args, reasoning)];
10103
+ }
10104
+ }
// Builds the single-element action list for an act tool call.
//   toolResult - value returned by the act tool; may be missing or a non-object
//   args       - input arguments of the tool call, merged into the action
//   reasoning  - text stored on the action
// Returns an array holding one action of type act. When the tool output carries
// a truthy playwrightArguments value it is copied onto the action.
10105
+ function mapActToolResult(toolResult, args, reasoning) {
// Without an object result there is nothing to extract, so fall back to the generic action shape.
10106
+ if (!toolResult || typeof toolResult !== "object") {
10107
+ return [createStandardAction("act", args, reasoning)];
10108
+ }
10109
+ const result = toolResult;
// Accept both a wrapper exposing an output property and a bare result object.
10110
+ const output = result.output || result;
// NOTE(review): __spreadValues is defined outside this hunk; presumably it copies the
// properties of args onto the literal, so keys in args could override type, reasoning
// or taskCompleted - confirm against the helper.
10111
+ const action = __spreadValues({
10112
+ type: "act",
10113
+ reasoning,
10114
+ taskCompleted: false
10115
+ }, args);
10116
+ if (output.playwrightArguments) {
10117
+ action.playwrightArguments = output.playwrightArguments;
10118
+ }
10119
+ return [action];
10120
+ }
// Builds the action list for a fillForm tool call.
//   toolResult - value returned by the fillForm tool; may be missing or a non-object
//   args       - input arguments of the tool call, merged into the first action
//   reasoning  - text stored on the fillForm action
// Returns an array that starts with one fillForm action, followed by one act
// action per entry of the playwrightArguments array found on the tool output.
10121
+ function mapFillFormToolResult(toolResult, args, reasoning) {
// Without an object result only the generic fillForm action can be recorded.
10122
+ if (!toolResult || typeof toolResult !== "object") {
10123
+ return [createStandardAction("fillForm", args, reasoning)];
10124
+ }
10125
+ const result = toolResult;
// Accept both a wrapper exposing an output property and a bare result object.
10126
+ const output = result.output || result;
// Anything other than an array is treated as no per-field entries.
10127
+ const observeResults = Array.isArray(output == null ? void 0 : output.playwrightArguments) ? output.playwrightArguments : [];
10128
+ const actions = [];
// NOTE(review): __spreadValues is defined outside this hunk; presumably it merges args
// over the literal - confirm against the helper.
10129
+ actions.push(__spreadValues({
10130
+ type: "fillForm",
10131
+ reasoning,
10132
+ taskCompleted: false
10133
+ }, args));
// Each entry becomes its own act action with a fixed reasoning string; the reasoning
// passed by the caller is kept only on the fillForm action above.
10134
+ for (const observeResult of observeResults) {
10135
+ actions.push({
10136
+ type: "act",
10137
+ reasoning: "acting from fillform tool",
10138
+ taskCompleted: false,
10139
+ playwrightArguments: observeResult
10140
+ });
10141
+ }
10142
+ return actions;
10143
+ }
// Builds the generic action object used for tools without a dedicated mapper.
//   toolCallName - stored as the action type
//   args         - input arguments of the tool call; may be null or undefined
//   reasoning    - text stored on the action
// Returns the object produced by __spreadValues from the literal below and args.
// NOTE(review): __spreadValues is defined outside this hunk; presumably it merges args
// over the literal - confirm against the helper.
10144
+ function createStandardAction(toolCallName, args, reasoning) {
10145
+ return __spreadValues({
10146
+ type: toolCallName,
10147
+ reasoning,
// Only the close tool can mark the task completed, using args.taskComplete
// (undefined when args is null or undefined); every other tool records false.
10148
+ taskCompleted: toolCallName === "close" ? args == null ? void 0 : args.taskComplete : false
10149
+ }, args);
10150
+ }
10151
+
10091
10152
  // lib/v3/handlers/v3AgentHandler.ts
10092
10153
  var V3AgentHandler = class {
10093
10154
  constructor(v3, logger, llmClient, executionModel, systemInstructions, mcpTools) {
@@ -10108,6 +10169,7 @@ var V3AgentHandler = class {
10108
10169
  let finalMessage = "";
10109
10170
  let completed = false;
10110
10171
  const collectedReasoning = [];
10172
+ let currentPageUrl = (yield this.v3.context.awaitActivePage()).url();
10111
10173
  try {
10112
10174
  const systemPrompt = this.buildSystemPrompt(
10113
10175
  options.instruction,
@@ -10142,14 +10204,17 @@ var V3AgentHandler = class {
10142
10204
  temperature: 1,
10143
10205
  toolChoice: "auto",
10144
10206
  onStepFinish: (event) => __async(this, null, function* () {
10207
+ var _a3;
10145
10208
  this.logger({
10146
10209
  category: "agent",
10147
10210
  message: `Step finished: ${event.finishReason}`,
10148
10211
  level: 2
10149
10212
  });
10150
10213
  if (event.toolCalls && event.toolCalls.length > 0) {
10151
- for (const toolCall of event.toolCalls) {
10214
+ for (let i = 0; i < event.toolCalls.length; i++) {
10215
+ const toolCall = event.toolCalls[i];
10152
10216
  const args = toolCall.input;
10217
+ const toolResult = (_a3 = event.toolResults) == null ? void 0 : _a3[i];
10153
10218
  if (event.text.length > 0) {
10154
10219
  collectedReasoning.push(event.text);
10155
10220
  this.logger({
@@ -10166,13 +10231,19 @@ var V3AgentHandler = class {
10166
10231
  finalMessage = closeReasoning ? `${allReasoning} ${closeReasoning}`.trim() : allReasoning || "Task completed successfully";
10167
10232
  }
10168
10233
  }
10169
- const action = __spreadValues({
10170
- type: toolCall.toolName,
10171
- reasoning: event.text || void 0,
10172
- taskCompleted: toolCall.toolName === "close" ? args == null ? void 0 : args.taskComplete : false
10173
- }, args);
10174
- actions.push(action);
10234
+ const mappedActions = mapToolResultToActions({
10235
+ toolCallName: toolCall.toolName,
10236
+ toolResult,
10237
+ args,
10238
+ reasoning: event.text || void 0
10239
+ });
10240
+ for (const action of mappedActions) {
10241
+ action.pageUrl = currentPageUrl;
10242
+ action.timestamp = Date.now();
10243
+ actions.push(action);
10244
+ }
10175
10245
  }
10246
+ currentPageUrl = (yield this.v3.context.awaitActivePage()).url();
10176
10247
  }
10177
10248
  })
10178
10249
  });
@@ -11856,6 +11927,7 @@ var GoogleCUAClient = class extends AgentClient {
11856
11927
  level: 2
11857
11928
  });
11858
11929
  if (action.type === "function" && action.name === "open_web_browser") {
11930
+ action.pageUrl = this.currentUrl;
11859
11931
  logger({
11860
11932
  category: "agent",
11861
11933
  message: "Skipping open_web_browser action",
@@ -12063,7 +12135,8 @@ var GoogleCUAClient = class extends AgentClient {
12063
12135
  return {
12064
12136
  type: "function",
12065
12137
  name: "open_web_browser",
12066
- arguments: null
12138
+ arguments: null,
12139
+ timestamp: Date.now()
12067
12140
  };
12068
12141
  case "click_at": {
12069
12142
  const { x, y } = this.normalizeCoordinates(
@@ -12211,6 +12284,9 @@ var GoogleCUAClient = class extends AgentClient {
12211
12284
  }
12212
12285
  captureScreenshot(options) {
12213
12286
  return __async(this, null, function* () {
12287
+ if (options == null ? void 0 : options.currentUrl) {
12288
+ this.currentUrl = options.currentUrl;
12289
+ }
12214
12290
  if (options == null ? void 0 : options.base64Image) {
12215
12291
  return `data:image/png;base64,${options.base64Image}`;
12216
12292
  }
@@ -12332,6 +12408,7 @@ var V3CuaAgentHandler = class {
12332
12408
  }));
12333
12409
  this.agentClient.setActionHandler((action) => __async(this, null, function* () {
12334
12410
  var _a2, _b, _c;
12411
+ action.pageUrl = (yield this.v3.context.awaitActivePage()).url();
12335
12412
  const defaultDelay = 1e3;
12336
12413
  const waitBetween = ((_a2 = this.options.clientOptions) == null ? void 0 : _a2.waitBetweenActions) || defaultDelay;
12337
12414
  try {
@@ -12343,6 +12420,7 @@ var V3CuaAgentHandler = class {
12343
12420
  }
12344
12421
  yield new Promise((r) => setTimeout(r, 300));
12345
12422
  yield this.executeAction(action);
12423
+ action.timestamp = Date.now();
12346
12424
  yield new Promise((r) => setTimeout(r, waitBetween));
12347
12425
  try {
12348
12426
  yield this.captureAndSendScreenshot();
@@ -38677,13 +38755,13 @@ var _V3 = class _V3 {
38677
38755
  level: 1
38678
38756
  });
38679
38757
  if (options == null ? void 0 : options.cua) {
38680
- if (!(options == null ? void 0 : options.model)) {
38681
- throw new Error("A CUA agent requires a model to be specified.");
38682
- }
38683
- const { modelName, isCua, clientOptions } = resolveModel(options.model);
38758
+ const modelToUse = (options == null ? void 0 : options.model) || __spreadValues({
38759
+ modelName: this.modelName
38760
+ }, this.modelClientOptions);
38761
+ const { modelName, isCua, clientOptions } = resolveModel(modelToUse);
38684
38762
  if (!isCua) {
38685
38763
  throw new Error(
38686
- "Model is not a CUA model. Try one of the following: " + AVAILABLE_CUA_MODELS.join(", ")
38764
+ "To use the computer use agent, please provide a CUA model in the agent constructor or stagehand config. Try one of our supported CUA models: " + AVAILABLE_CUA_MODELS.join(", ")
38687
38765
  );
38688
38766
  }
38689
38767
  const agentConfigSignature2 = this.agentCache.buildConfigSignature(options);
@@ -38770,10 +38848,11 @@ Do not ask follow up questions, the user will trust your judgement.`
38770
38848
  );
38771
38849
  }
38772
38850
  const tools = (options == null ? void 0 : options.integrations) ? yield resolveTools(options.integrations, options.tools) : (_a2 = options == null ? void 0 : options.tools) != null ? _a2 : {};
38851
+ const agentLlmClient = (options == null ? void 0 : options.model) ? this.resolveLlmClient(options.model) : this.llmClient;
38773
38852
  const handler = new V3AgentHandler(
38774
38853
  this,
38775
38854
  this.logger,
38776
- this.llmClient,
38855
+ agentLlmClient,
38777
38856
  typeof (options == null ? void 0 : options.executionModel) === "string" ? options.executionModel : (_b = options == null ? void 0 : options.executionModel) == null ? void 0 : _b.modelName,
38778
38857
  options == null ? void 0 : options.systemPrompt,
38779
38858
  tools
package/package.json CHANGED
@@ -1,21 +1,10 @@
1
1
  {
2
2
  "name": "@browserbasehq/orca",
3
- "version": "3.0.0-preview.4",
3
+ "version": "3.0.0-preview.6",
4
4
  "description": "An AI web browsing framework focused on simplicity and extensibility.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.js",
7
7
  "types": "./dist/index.d.ts",
8
- "scripts": {
9
- "gen-version": "tsx scripts/gen-version.ts",
10
- "build-dom-scripts": "tsx lib/v3/dom/genDomScripts.ts && tsx lib/v3/dom/genLocatorScripts.ts",
11
- "build-js": "tsup --entry.index lib/v3/index.ts --dts",
12
- "typecheck": "tsc --noEmit",
13
- "prepare": "pnpm run build",
14
- "build": "pnpm run gen-version && pnpm run build-dom-scripts && pnpm run build-js && pnpm run typecheck",
15
- "example": "node --import tsx -e \"const args=process.argv.slice(1).filter(a=>a!=='--'); const [p]=args; const n=(p||'example').replace(/^\\.\\//,'').replace(/\\.ts$/i,''); import(new URL(require('node:path').resolve('examples', n + '.ts'), 'file:'));\" --",
16
- "lint": "cd ../.. && prettier --check packages/core && cd packages/core && eslint .",
17
- "format": "prettier --write ."
18
- },
19
8
  "files": [
20
9
  "dist/index.js",
21
10
  "dist/index.d.ts",
@@ -89,5 +78,15 @@
89
78
  "bugs": {
90
79
  "url": "https://github.com/browserbase/stagehand/issues"
91
80
  },
92
- "homepage": "https://stagehand.dev"
93
- }
81
+ "homepage": "https://stagehand.dev",
82
+ "scripts": {
83
+ "gen-version": "tsx scripts/gen-version.ts",
84
+ "build-dom-scripts": "tsx lib/v3/dom/genDomScripts.ts && tsx lib/v3/dom/genLocatorScripts.ts",
85
+ "build-js": "tsup --entry.index lib/v3/index.ts --dts",
86
+ "typecheck": "tsc --noEmit",
87
+ "build": "pnpm run gen-version && pnpm run build-dom-scripts && pnpm run build-js && pnpm run typecheck",
88
+ "example": "node --import tsx -e \"const args=process.argv.slice(1).filter(a=>a!=='--'); const [p]=args; const n=(p||'example').replace(/^\\.\\//,'').replace(/\\.ts$/i,''); import(new URL(require('node:path').resolve('examples', n + '.ts'), 'file:'));\" --",
89
+ "lint": "cd ../.. && prettier --check packages/core && cd packages/core && eslint .",
90
+ "format": "prettier --write ."
91
+ }
92
+ }