@browserbasehq/stagehand 3.0.8-alpha-091296e438bb2374c8bb10ef6c08283978145ebf → 3.0.8-alpha-16d72fb4c4081dd33bf45605d75c27644ea4c00e

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -3403,6 +3403,63 @@ interface StreamingAgentInstance {
3403
3403
  interface NonStreamingAgentInstance {
3404
3404
  execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentResult>;
3405
3405
  }
3406
+ /**
3407
+ * Content item type for toModelOutput return values.
3408
+ * Used in tool definitions to return text and/or media to the model.
3409
+ */
3410
+ type ModelOutputContentItem = {
3411
+ type: "text";
3412
+ text: string;
3413
+ } | {
3414
+ type: "media";
3415
+ mediaType: string;
3416
+ data: string;
3417
+ };
3418
+ interface ClickToolResult {
3419
+ success: boolean;
3420
+ describe?: string;
3421
+ coordinates?: number[];
3422
+ error?: string;
3423
+ screenshotBase64?: string;
3424
+ }
3425
+ interface TypeToolResult {
3426
+ success: boolean;
3427
+ describe?: string;
3428
+ text?: string;
3429
+ error?: string;
3430
+ screenshotBase64?: string;
3431
+ }
3432
+ interface DragAndDropToolResult {
3433
+ success: boolean;
3434
+ describe?: string;
3435
+ error?: string;
3436
+ screenshotBase64?: string;
3437
+ }
3438
+ interface FillFormField {
3439
+ action: string;
3440
+ value: string;
3441
+ coordinates: {
3442
+ x: number;
3443
+ y: number;
3444
+ };
3445
+ }
3446
+ interface FillFormVisionToolResult {
3447
+ success: boolean;
3448
+ playwrightArguments?: FillFormField[];
3449
+ error?: string;
3450
+ screenshotBase64?: string;
3451
+ }
3452
+ interface ScrollVisionToolResult {
3453
+ success: boolean;
3454
+ message: string;
3455
+ scrolledPixels: number;
3456
+ screenshotBase64?: string;
3457
+ }
3458
+ interface WaitToolResult {
3459
+ success: boolean;
3460
+ waited: number;
3461
+ screenshotBase64?: string;
3462
+ }
3406
3463
 
3407
3464
  type OpenAIClientOptions = Pick<ClientOptions$1, "baseURL" | "apiKey">;
3408
3465
  type AnthropicClientOptions = Pick<ClientOptions$2, "baseURL" | "apiKey">;
@@ -3815,12 +3872,9 @@ declare const screenshotTool: (v3: V3) => ai.Tool<Record<string, never>, {
3815
3872
  pageUrl: string;
3816
3873
  }>;
3817
3874
 
3818
- declare const waitTool: (v3: V3) => ai.Tool<{
3875
+ declare const waitTool: (v3: V3, mode?: AgentToolMode) => ai.Tool<{
3819
3876
  timeMs: number;
3820
- }, {
3821
- success: boolean;
3822
- waited: number;
3823
- }>;
3877
+ }, WaitToolResult>;
3824
3878
 
3825
3879
  declare const navBackTool: (v3: V3) => ai.Tool<{
3826
3880
  reasoningText: string;
@@ -3873,11 +3927,7 @@ declare const scrollVisionTool: (v3: V3, provider?: string) => ai.Tool<{
3873
3927
  direction: "up" | "down";
3874
3928
  coordinates?: number[];
3875
3929
  percentage?: number;
3876
- }, {
3877
- success: boolean;
3878
- message: string;
3879
- scrolledPixels: number;
3880
- }>;
3930
+ }, ScrollVisionToolResult>;
3881
3931
 
3882
3932
  declare const extractTool: (v3: V3, executionModel?: string, logger?: (message: LogLine) => void) => ai.Tool<{
3883
3933
  instruction: string;
@@ -3895,47 +3945,19 @@ declare const extractTool: (v3: V3, executionModel?: string, logger?: (message:
3895
3945
  declare const clickTool: (v3: V3, provider?: string) => ai.Tool<{
3896
3946
  describe: string;
3897
3947
  coordinates: number[];
3898
- }, {
3899
- success: boolean;
3900
- describe: string;
3901
- coordinates: number[];
3902
- error?: undefined;
3903
- } | {
3904
- success: boolean;
3905
- error: string;
3906
- describe?: undefined;
3907
- coordinates?: undefined;
3908
- }>;
3948
+ }, ClickToolResult>;
3909
3949
 
3910
3950
  declare const typeTool: (v3: V3, provider?: string) => ai.Tool<{
3911
3951
  describe: string;
3912
3952
  text: string;
3913
3953
  coordinates: number[];
3914
- }, {
3915
- success: boolean;
3916
- describe: string;
3917
- text: string;
3918
- error?: undefined;
3919
- } | {
3920
- success: boolean;
3921
- error: string;
3922
- describe?: undefined;
3923
- text?: undefined;
3924
- }>;
3954
+ }, TypeToolResult>;
3925
3955
 
3926
3956
  declare const dragAndDropTool: (v3: V3, provider?: string) => ai.Tool<{
3927
3957
  describe: string;
3928
3958
  startCoordinates: number[];
3929
3959
  endCoordinates: number[];
3930
- }, {
3931
- success: boolean;
3932
- describe: string;
3933
- error?: undefined;
3934
- } | {
3935
- success: boolean;
3936
- error: string;
3937
- describe?: undefined;
3938
- }>;
3960
+ }, DragAndDropToolResult>;
3939
3961
 
3940
3962
  declare const clickAndHoldTool: (v3: V3, provider?: string) => ai.Tool<{
3941
3963
  describe: string;
@@ -3984,22 +4006,7 @@ declare const fillFormVisionTool: (v3: V3, provider?: string) => ai.Tool<{
3984
4006
  y: number;
3985
4007
  };
3986
4008
  }[];
3987
- }, {
3988
- success: boolean;
3989
- playwrightArguments: {
3990
- coordinates: {
3991
- x: number;
3992
- y: number;
3993
- };
3994
- action: string;
3995
- value: string;
3996
- }[];
3997
- error?: undefined;
3998
- } | {
3999
- success: boolean;
4000
- error: string;
4001
- playwrightArguments?: undefined;
4002
- }>;
4009
+ }, FillFormVisionToolResult>;
4003
4010
 
4004
4011
  declare const thinkTool: () => ai.Tool<{
4005
4012
  reasoning: string;
@@ -4187,4 +4194,4 @@ declare class V3Evaluator {
4187
4194
  private _evaluateWithMultipleScreenshots;
4188
4195
  }
4189
4196
 
4190
- export { type AISDKCustomProvider, type AISDKProvider, AISdkClient, AVAILABLE_CUA_MODELS, type ActOptions, type ActResult, ActTimeoutError, type Action, type ActionExecutionResult, AgentAbortError, type AgentAction, type AgentCallbacks, type AgentConfig, type AgentContext, type AgentExecuteCallbacks, type AgentExecuteOptions, type AgentExecuteOptionsBase, type AgentExecutionOptions, type AgentHandlerOptions, type AgentInstance, type AgentModelConfig, AgentProvider, type AgentProviderType, type AgentResult, AgentScreenshotProviderError, type AgentState, type AgentStreamCallbacks, type AgentStreamExecuteOptions, type AgentStreamResult, type AgentToolCall, type AgentToolMode, type AgentToolResult, type AgentToolTypesMap, type AgentTools, type AgentType, type AgentUITools, AnnotatedScreenshotText, type AnthropicClientOptions, type AnthropicContentBlock, type AnthropicJsonSchemaObject, type AnthropicMessage, type AnthropicTextBlock, type AnthropicToolResult, type AnyPage, api as Api, type AvailableCuaModel, type AvailableModel, BrowserbaseSessionNotFoundError, CaptchaTimeoutError, type ChatCompletionOptions, type ChatMessage, type ChatMessageContent, type ChatMessageImageContent, type ChatMessageTextContent, type ClientOptions, type ComputerCallItem, ConnectionTimeoutError, type ConsoleListener, ConsoleMessage, ContentFrameNotFoundError, type CreateChatCompletionOptions, CreateChatCompletionResponseError, CuaModelRequiredError, ElementNotVisibleError, ExperimentalApiConflictError, ExperimentalNotConfiguredError, type ExtractOptions, type ExtractResult, ExtractTimeoutError, type FunctionCallItem, type GoogleServiceAccountCredentials, type GoogleVertexProviderSettings, HandlerNotInitializedError, type HistoryEntry, type InferStagehandSchema, InvalidAISDKModelFormatError, type JsonSchema, type JsonSchemaDocument, type JsonSchemaProperty, LLMClient, type LLMParsedResponse, type LLMResponse, LLMResponseError, type LLMTool, type LLMUsage, LOG_LEVEL_NAMES, type LoadState, 
type LocalBrowserLaunchOptions, type LogLevel, type LogLine, type Logger, MCPConnectionError, MissingEnvironmentVariableError, MissingLLMConfigurationError, type ModelConfiguration, type ModelProvider, type NonStreamingAgentInstance, type ObserveOptions, ObserveTimeoutError, type OpenAIClientOptions, Page, PageNotFoundError, Response$1 as Response, ResponseBodyError, type ResponseInputItem, type ResponseItem, ResponseParseError, type SafetyCheck, type SafetyConfirmationHandler, type SafetyConfirmationResponse, V3 as Stagehand, StagehandAPIError, StagehandAPIUnauthorizedError, StagehandClickError, StagehandClosedError, StagehandDefaultError, StagehandDomProcessError, StagehandElementNotFoundError, StagehandEnvironmentError, StagehandError, StagehandEvalError, StagehandHttpError, StagehandIframeError, StagehandInitError, StagehandInvalidArgumentError, type StagehandMetrics, StagehandMissingArgumentError, StagehandNotInitializedError, StagehandResponseBodyError, StagehandResponseParseError, StagehandServerError, StagehandShadowRootMissingError, StagehandShadowSegmentEmptyError, StagehandShadowSegmentNotFoundError, type StagehandZodObject, type StagehandZodSchema, type StreamingAgentInstance, StreamingCallbacksInNonStreamingModeError, TimeoutError, type ToolUseItem, UnsupportedAISDKModelProviderError, UnsupportedModelError, UnsupportedModelProviderError, V3, type V3Env, V3Evaluator, V3FunctionName, type V3Options, XPathResolutionError, ZodSchemaValidationError, connectToMCPServer, defaultExtractSchema, getZodType, injectUrls, isRunningInBun, isZod3Schema, isZod4Schema, jsonSchemaToZod, loadApiKeyFromEnv, localBrowserLaunchOptionsSchema, modelToAgentProviderMap, pageTextSchema, providerEnvVarMap, toGeminiSchema, toJsonSchema, transformSchema, trimTrailingTextNode, validateZodSchema };
4197
+ export { type AISDKCustomProvider, type AISDKProvider, AISdkClient, AVAILABLE_CUA_MODELS, type ActOptions, type ActResult, ActTimeoutError, type Action, type ActionExecutionResult, AgentAbortError, type AgentAction, type AgentCallbacks, type AgentConfig, type AgentContext, type AgentExecuteCallbacks, type AgentExecuteOptions, type AgentExecuteOptionsBase, type AgentExecutionOptions, type AgentHandlerOptions, type AgentInstance, type AgentModelConfig, AgentProvider, type AgentProviderType, type AgentResult, AgentScreenshotProviderError, type AgentState, type AgentStreamCallbacks, type AgentStreamExecuteOptions, type AgentStreamResult, type AgentToolCall, type AgentToolMode, type AgentToolResult, type AgentToolTypesMap, type AgentTools, type AgentType, type AgentUITools, AnnotatedScreenshotText, type AnthropicClientOptions, type AnthropicContentBlock, type AnthropicJsonSchemaObject, type AnthropicMessage, type AnthropicTextBlock, type AnthropicToolResult, type AnyPage, api as Api, type AvailableCuaModel, type AvailableModel, BrowserbaseSessionNotFoundError, CaptchaTimeoutError, type ChatCompletionOptions, type ChatMessage, type ChatMessageContent, type ChatMessageImageContent, type ChatMessageTextContent, type ClickToolResult, type ClientOptions, type ComputerCallItem, ConnectionTimeoutError, type ConsoleListener, ConsoleMessage, ContentFrameNotFoundError, type CreateChatCompletionOptions, CreateChatCompletionResponseError, CuaModelRequiredError, type DragAndDropToolResult, ElementNotVisibleError, ExperimentalApiConflictError, ExperimentalNotConfiguredError, type ExtractOptions, type ExtractResult, ExtractTimeoutError, type FillFormField, type FillFormVisionToolResult, type FunctionCallItem, type GoogleServiceAccountCredentials, type GoogleVertexProviderSettings, HandlerNotInitializedError, type HistoryEntry, type InferStagehandSchema, InvalidAISDKModelFormatError, type JsonSchema, type JsonSchemaDocument, type JsonSchemaProperty, LLMClient, type LLMParsedResponse, 
type LLMResponse, LLMResponseError, type LLMTool, type LLMUsage, LOG_LEVEL_NAMES, type LoadState, type LocalBrowserLaunchOptions, type LogLevel, type LogLine, type Logger, MCPConnectionError, MissingEnvironmentVariableError, MissingLLMConfigurationError, type ModelConfiguration, type ModelOutputContentItem, type ModelProvider, type NonStreamingAgentInstance, type ObserveOptions, ObserveTimeoutError, type OpenAIClientOptions, Page, PageNotFoundError, Response$1 as Response, ResponseBodyError, type ResponseInputItem, type ResponseItem, ResponseParseError, type SafetyCheck, type SafetyConfirmationHandler, type SafetyConfirmationResponse, type ScrollVisionToolResult, V3 as Stagehand, StagehandAPIError, StagehandAPIUnauthorizedError, StagehandClickError, StagehandClosedError, StagehandDefaultError, StagehandDomProcessError, StagehandElementNotFoundError, StagehandEnvironmentError, StagehandError, StagehandEvalError, StagehandHttpError, StagehandIframeError, StagehandInitError, StagehandInvalidArgumentError, type StagehandMetrics, StagehandMissingArgumentError, StagehandNotInitializedError, StagehandResponseBodyError, StagehandResponseParseError, StagehandServerError, StagehandShadowRootMissingError, StagehandShadowSegmentEmptyError, StagehandShadowSegmentNotFoundError, type StagehandZodObject, type StagehandZodSchema, type StreamingAgentInstance, StreamingCallbacksInNonStreamingModeError, TimeoutError, type ToolUseItem, type TypeToolResult, UnsupportedAISDKModelProviderError, UnsupportedModelError, UnsupportedModelProviderError, V3, type V3Env, V3Evaluator, V3FunctionName, type V3Options, type WaitToolResult, XPathResolutionError, ZodSchemaValidationError, connectToMCPServer, defaultExtractSchema, getZodType, injectUrls, isRunningInBun, isZod3Schema, isZod4Schema, jsonSchemaToZod, loadApiKeyFromEnv, localBrowserLaunchOptionsSchema, modelToAgentProviderMap, pageTextSchema, providerEnvVarMap, toGeminiSchema, toJsonSchema, transformSchema, trimTrailingTextNode, 
validateZodSchema };
package/dist/index.js CHANGED
@@ -31125,7 +31125,25 @@ var screenshotTool = (v3) => (0, import_ai3.tool)({
31125
31125
  // lib/v3/agent/tools/wait.ts
31126
31126
  var import_ai4 = require("ai");
31127
31127
  var import_zod8 = require("zod");
31128
- var waitTool = (v3) => (0, import_ai4.tool)({
31128
+
31129
+ // lib/v3/agent/utils/screenshotHandler.ts
31130
+ var DEFAULT_DELAY_MS = 500;
31131
+ function waitAndCaptureScreenshot(_0) {
31132
+ return __async(this, arguments, function* (page, delayMs = DEFAULT_DELAY_MS) {
31133
+ if (delayMs > 0) {
31134
+ yield page.waitForTimeout(delayMs);
31135
+ }
31136
+ try {
31137
+ const buffer = yield page.screenshot({ fullPage: false });
31138
+ return buffer.toString("base64");
31139
+ } catch (e2) {
31140
+ return void 0;
31141
+ }
31142
+ });
31143
+ }
31144
+
31145
+ // lib/v3/agent/tools/wait.ts
31146
+ var waitTool = (v3, mode) => (0, import_ai4.tool)({
31129
31147
  description: "Wait for a specified time",
31130
31148
  inputSchema: import_zod8.z.object({
31131
31149
  timeMs: import_zod8.z.number().describe("Time in milliseconds")
@@ -31146,8 +31164,32 @@ var waitTool = (v3) => (0, import_ai4.tool)({
31146
31164
  if (timeMs > 0) {
31147
31165
  v3.recordAgentReplayStep({ type: "wait", timeMs });
31148
31166
  }
31167
+ if (mode === "hybrid") {
31168
+ const page = yield v3.context.awaitActivePage();
31169
+ const screenshotBase64 = yield waitAndCaptureScreenshot(page, 0);
31170
+ return { success: true, waited: timeMs, screenshotBase64 };
31171
+ }
31149
31172
  return { success: true, waited: timeMs };
31150
- })
31173
+ }),
31174
+ toModelOutput: (result) => {
31175
+ const content = [
31176
+ {
31177
+ type: "text",
31178
+ text: JSON.stringify({
31179
+ success: result.success,
31180
+ waited: result.waited
31181
+ })
31182
+ }
31183
+ ];
31184
+ if (result.screenshotBase64) {
31185
+ content.push({
31186
+ type: "media",
31187
+ mediaType: "image/png",
31188
+ data: result.screenshotBase64
31189
+ });
31190
+ }
31191
+ return { type: "content", value: content };
31192
+ }
31151
31193
  });
31152
31194
 
31153
31195
  // lib/v3/agent/tools/navback.ts
@@ -31346,7 +31388,11 @@ var scrollVisionTool = (v3, provider) => (0, import_ai9.tool)({
31346
31388
  ),
31347
31389
  percentage: import_zod13.z.number().min(1).max(200).optional()
31348
31390
  }),
31349
- execute: (_0) => __async(null, [_0], function* ({ direction, coordinates, percentage = 80 }) {
31391
+ execute: (_0) => __async(null, [_0], function* ({
31392
+ direction,
31393
+ coordinates,
31394
+ percentage = 80
31395
+ }) {
31350
31396
  const page = yield v3.context.awaitActivePage();
31351
31397
  const { w, h: h2 } = yield page.mainFrame().evaluate("({ w: window.innerWidth, h: window.innerHeight })");
31352
31398
  let cx;
@@ -31382,6 +31428,7 @@ var scrollVisionTool = (v3, provider) => (0, import_ai9.tool)({
31382
31428
  const scrollDistance = Math.round(h2 * percentage / 100);
31383
31429
  const deltaY = direction === "up" ? -scrollDistance : scrollDistance;
31384
31430
  yield page.scroll(cx, cy, 0, deltaY);
31431
+ const screenshotBase64 = yield waitAndCaptureScreenshot(page, 100);
31385
31432
  v3.recordAgentReplayStep({
31386
31433
  type: "scroll",
31387
31434
  deltaX: 0,
@@ -31391,9 +31438,30 @@ var scrollVisionTool = (v3, provider) => (0, import_ai9.tool)({
31391
31438
  return {
31392
31439
  success: true,
31393
31440
  message: coordinates ? `Scrolled ${percentage}% ${direction} at (${cx}, ${cy})` : `Scrolled ${percentage}% ${direction}`,
31394
- scrolledPixels: scrollDistance
31441
+ scrolledPixels: scrollDistance,
31442
+ screenshotBase64
31395
31443
  };
31396
- })
31444
+ }),
31445
+ toModelOutput: (result) => {
31446
+ const content = [
31447
+ {
31448
+ type: "text",
31449
+ text: JSON.stringify({
31450
+ success: result.success,
31451
+ message: result.message,
31452
+ scrolledPixels: result.scrolledPixels
31453
+ })
31454
+ }
31455
+ ];
31456
+ if (result.screenshotBase64) {
31457
+ content.push({
31458
+ type: "media",
31459
+ mediaType: "image/png",
31460
+ data: result.screenshotBase64
31461
+ });
31462
+ }
31463
+ return { type: "content", value: content };
31464
+ }
31397
31465
  });
31398
31466
 
31399
31467
  // lib/v3/agent/tools/extract.ts
@@ -31461,9 +31529,6 @@ function ensureXPath(value) {
31461
31529
  }
31462
31530
 
31463
31531
  // lib/v3/agent/tools/click.ts
31464
- function waitForTimeout(ms) {
31465
- return new Promise((resolve3) => setTimeout(resolve3, ms));
31466
- }
31467
31532
  var clickTool = (v3, provider) => (0, import_ai11.tool)({
31468
31533
  description: "Click on an element using its coordinates (this is the most reliable way to click on an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
31469
31534
  inputSchema: import_zod15.z.object({
@@ -31495,9 +31560,7 @@ var clickTool = (v3, provider) => (0, import_ai11.tool)({
31495
31560
  const xpath = yield page.click(processed.x, processed.y, {
31496
31561
  returnXpath: shouldCollectXpath
31497
31562
  });
31498
- if (isGoogleProvider(provider)) {
31499
- yield waitForTimeout(1e3);
31500
- }
31563
+ const screenshotBase64 = yield waitAndCaptureScreenshot(page);
31501
31564
  if (shouldCollectXpath) {
31502
31565
  const normalizedXpath = ensureXPath(xpath);
31503
31566
  if (normalizedXpath) {
@@ -31518,7 +31581,8 @@ var clickTool = (v3, provider) => (0, import_ai11.tool)({
31518
31581
  return {
31519
31582
  success: true,
31520
31583
  describe,
31521
- coordinates: [processed.x, processed.y]
31584
+ coordinates: [processed.x, processed.y],
31585
+ screenshotBase64
31522
31586
  };
31523
31587
  } catch (error) {
31524
31588
  return {
@@ -31526,15 +31590,46 @@ var clickTool = (v3, provider) => (0, import_ai11.tool)({
31526
31590
  error: `Error clicking: ${error.message}`
31527
31591
  };
31528
31592
  }
31529
- })
31593
+ }),
31594
+ toModelOutput: (result) => {
31595
+ if (result.success) {
31596
+ const content = [
31597
+ {
31598
+ type: "text",
31599
+ text: JSON.stringify({
31600
+ success: result.success,
31601
+ describe: result.describe,
31602
+ coordinates: result.coordinates
31603
+ })
31604
+ }
31605
+ ];
31606
+ if (result.screenshotBase64) {
31607
+ content.push({
31608
+ type: "media",
31609
+ mediaType: "image/png",
31610
+ data: result.screenshotBase64
31611
+ });
31612
+ }
31613
+ return { type: "content", value: content };
31614
+ }
31615
+ return {
31616
+ type: "content",
31617
+ value: [
31618
+ {
31619
+ type: "text",
31620
+ text: JSON.stringify({
31621
+ success: result.success,
31622
+ error: result.error
31623
+ })
31624
+ }
31625
+ ]
31626
+ };
31627
+ }
31530
31628
  });
31531
31629
 
31532
31630
  // lib/v3/agent/tools/type.ts
31533
31631
  var import_ai12 = require("ai");
31534
31632
  var import_zod16 = require("zod");
31535
- function waitForTimeout2(ms) {
31536
- return new Promise((resolve3) => setTimeout(resolve3, ms));
31537
- }
31538
31633
  var typeTool = (v3, provider) => (0, import_ai12.tool)({
31539
31634
  description: "Type text into an element using its coordinates. This will click the element and then type the text into it (this is the most reliable way to type into an element, always use this over act, unless the element is not visible in the screenshot, but shown in ariaTree)",
31540
31635
  inputSchema: import_zod16.z.object({
@@ -31544,7 +31639,11 @@ var typeTool = (v3, provider) => (0, import_ai12.tool)({
31544
31639
  text: import_zod16.z.string().describe("The text to type into the element"),
31545
31640
  coordinates: import_zod16.z.array(import_zod16.z.number()).describe("The (x, y) coordinates to type into the element")
31546
31641
  }),
31547
- execute: (_0) => __async(null, [_0], function* ({ describe, coordinates, text }) {
31642
+ execute: (_0) => __async(null, [_0], function* ({
31643
+ describe,
31644
+ coordinates,
31645
+ text
31646
+ }) {
31548
31647
  try {
31549
31648
  const page = yield v3.context.awaitActivePage();
31550
31649
  const processed = processCoordinates(
@@ -31567,10 +31666,8 @@ var typeTool = (v3, provider) => (0, import_ai12.tool)({
31567
31666
  const xpath = yield page.click(processed.x, processed.y, {
31568
31667
  returnXpath: shouldCollectXpath
31569
31668
  });
31570
- if (isGoogleProvider(provider)) {
31571
- yield waitForTimeout2(1e3);
31572
- }
31573
31669
  yield page.type(text);
31670
+ const screenshotBase64 = yield waitAndCaptureScreenshot(page);
31574
31671
  if (shouldCollectXpath) {
31575
31672
  const normalizedXpath = ensureXPath(xpath);
31576
31673
  if (normalizedXpath) {
@@ -31588,14 +31685,53 @@ var typeTool = (v3, provider) => (0, import_ai12.tool)({
31588
31685
  });
31589
31686
  }
31590
31687
  }
31591
- return { success: true, describe, text };
31688
+ return {
31689
+ success: true,
31690
+ describe,
31691
+ text,
31692
+ screenshotBase64
31693
+ };
31592
31694
  } catch (error) {
31593
31695
  return {
31594
31696
  success: false,
31595
31697
  error: `Error typing: ${error.message}`
31596
31698
  };
31597
31699
  }
31598
- })
31700
+ }),
31701
+ toModelOutput: (result) => {
31702
+ if (result.success) {
31703
+ const content = [
31704
+ {
31705
+ type: "text",
31706
+ text: JSON.stringify({
31707
+ success: result.success,
31708
+ describe: result.describe,
31709
+ text: result.text
31710
+ })
31711
+ }
31712
+ ];
31713
+ if (result.screenshotBase64) {
31714
+ content.push({
31715
+ type: "media",
31716
+ mediaType: "image/png",
31717
+ data: result.screenshotBase64
31718
+ });
31719
+ }
31720
+ return { type: "content", value: content };
31721
+ }
31722
+ return {
31723
+ type: "content",
31724
+ value: [
31725
+ {
31726
+ type: "text",
31727
+ text: JSON.stringify({
31728
+ success: result.success,
31729
+ error: result.error
31730
+ })
31731
+ }
31732
+ ]
31733
+ };
31734
+ }
31599
31735
  });
31600
31736
 
31601
31737
  // lib/v3/agent/tools/dragAndDrop.ts
@@ -31608,7 +31744,11 @@ var dragAndDropTool = (v3, provider) => (0, import_ai13.tool)({
31608
31744
  startCoordinates: import_zod17.z.array(import_zod17.z.number()).describe("The (x, y) coordinates to start the drag and drop from"),
31609
31745
  endCoordinates: import_zod17.z.array(import_zod17.z.number()).describe("The (x, y) coordinates to end the drag and drop at")
31610
31746
  }),
31611
- execute: (_0) => __async(null, [_0], function* ({ describe, startCoordinates, endCoordinates }) {
31747
+ execute: (_0) => __async(null, [_0], function* ({
31748
+ describe,
31749
+ startCoordinates,
31750
+ endCoordinates
31751
+ }) {
31612
31752
  try {
31613
31753
  const page = yield v3.context.awaitActivePage();
31614
31754
  const processedStart = processCoordinates(
@@ -31646,6 +31786,7 @@ var dragAndDropTool = (v3, provider) => (0, import_ai13.tool)({
31646
31786
  processedEnd.y,
31647
31787
  { returnXpath: shouldCollectXpath }
31648
31788
  );
31789
+ const screenshotBase64 = yield waitAndCaptureScreenshot(page);
31649
31790
  if (shouldCollectXpath) {
31650
31791
  const normalizedFrom = ensureXPath(fromXpath);
31651
31792
  const normalizedTo = ensureXPath(toXpath);
@@ -31664,14 +31805,51 @@ var dragAndDropTool = (v3, provider) => (0, import_ai13.tool)({
31664
31805
  });
31665
31806
  }
31666
31807
  }
31667
- return { success: true, describe };
31808
+ return {
31809
+ success: true,
31810
+ describe,
31811
+ screenshotBase64
31812
+ };
31668
31813
  } catch (error) {
31669
31814
  return {
31670
31815
  success: false,
31671
31816
  error: `Error dragging: ${error.message}`
31672
31817
  };
31673
31818
  }
31674
- })
31819
+ }),
31820
+ toModelOutput: (result) => {
31821
+ if (result.success) {
31822
+ const content = [
31823
+ {
31824
+ type: "text",
31825
+ text: JSON.stringify({
31826
+ success: result.success,
31827
+ describe: result.describe
31828
+ })
31829
+ }
31830
+ ];
31831
+ if (result.screenshotBase64) {
31832
+ content.push({
31833
+ type: "media",
31834
+ mediaType: "image/png",
31835
+ data: result.screenshotBase64
31836
+ });
31837
+ }
31838
+ return { type: "content", value: content };
31839
+ }
31840
+ return {
31841
+ type: "content",
31842
+ value: [
31843
+ {
31844
+ type: "text",
31845
+ text: JSON.stringify({
31846
+ success: result.success,
31847
+ error: result.error
31848
+ })
31849
+ }
31850
+ ]
31851
+ };
31852
+ }
31675
31853
  });
31676
31854
 
31677
31855
  // lib/v3/agent/tools/clickAndHold.ts
@@ -31888,6 +32066,7 @@ MANDATORY USE CASES (always use fillFormVision for these):
31888
32066
  }
31889
32067
  yield new Promise((resolve3) => setTimeout(resolve3, 100));
31890
32068
  }
32069
+ const screenshotBase64 = yield waitAndCaptureScreenshot(page, 100);
31891
32070
  if (shouldCollectXpath && actions.length > 0) {
31892
32071
  v3.recordAgentReplayStep({
31893
32072
  type: "act",
@@ -31898,7 +32077,8 @@ MANDATORY USE CASES (always use fillFormVision for these):
31898
32077
  }
31899
32078
  return {
31900
32079
  success: true,
31901
- playwrightArguments: processedFields
32080
+ playwrightArguments: processedFields,
32081
+ screenshotBase64
31902
32082
  };
31903
32083
  } catch (error) {
31904
32084
  return {
@@ -31906,7 +32086,41 @@ MANDATORY USE CASES (always use fillFormVision for these):
31906
32086
  error: `Error filling form: ${error.message}`
31907
32087
  };
31908
32088
  }
31909
- })
32089
+ }),
32090
+ toModelOutput: (result) => {
32091
+ var _a4, _b;
32092
+ if (result.success) {
32093
+ const content = [
32094
+ {
32095
+ type: "text",
32096
+ text: JSON.stringify({
32097
+ success: result.success,
32098
+ fieldsCount: (_b = (_a4 = result.playwrightArguments) == null ? void 0 : _a4.length) != null ? _b : 0
32099
+ })
32100
+ }
32101
+ ];
32102
+ if (result.screenshotBase64) {
32103
+ content.push({
32104
+ type: "media",
32105
+ mediaType: "image/png",
32106
+ data: result.screenshotBase64
32107
+ });
32108
+ }
32109
+ return { type: "content", value: content };
32110
+ }
32111
+ return {
32112
+ type: "content",
32113
+ value: [
32114
+ {
32115
+ type: "text",
32116
+ text: JSON.stringify({
32117
+ success: result.success,
32118
+ error: result.error
32119
+ })
32120
+ }
32121
+ ]
32122
+ };
32123
+ }
31910
32124
  });
31911
32125
 
31912
32126
  // lib/v3/agent/tools/think.ts
@@ -32055,7 +32269,7 @@ function createAgentTools(v3, options) {
32055
32269
  scroll: mode === "hybrid" ? scrollVisionTool(v3, provider) : scrollTool(v3),
32056
32270
  think: thinkTool(),
32057
32271
  type: typeTool(v3, provider),
32058
- wait: waitTool(v3)
32272
+ wait: waitTool(v3, mode)
32059
32273
  };
32060
32274
  if (process.env.BRAVE_API_KEY) {
32061
32275
  allTools.search = searchTool(v3);
@@ -32253,37 +32467,54 @@ function buildAgentSystemPrompt(options) {
32253
32467
  var import_ai19 = require("ai");
32254
32468
 
32255
32469
  // lib/v3/agent/utils/messageProcessing.ts
32470
+ var VISION_ACTION_TOOLS = [
32471
+ "click",
32472
+ "type",
32473
+ "dragAndDrop",
32474
+ "wait",
32475
+ "fillFormVision",
32476
+ "scroll"
32477
+ ];
32256
32478
  function isToolMessage(message) {
32257
32479
  return !!message && typeof message === "object" && message.role === "tool" && Array.isArray(message.content);
32258
32480
  }
32259
32481
  function isScreenshotPart(part) {
32260
32482
  return !!part && typeof part === "object" && part.toolName === "screenshot";
32261
32483
  }
32484
+ function isVisionActionPart(part) {
32485
+ if (!part || typeof part !== "object") return false;
32486
+ const toolName = part.toolName;
32487
+ return typeof toolName === "string" && VISION_ACTION_TOOLS.includes(toolName);
32488
+ }
32489
+ function isVisionPart(part) {
32490
+ return isScreenshotPart(part) || isVisionActionPart(part);
32491
+ }
32262
32492
  function isAriaTreePart(part) {
32263
32493
  return !!part && typeof part === "object" && part.toolName === "ariaTree";
32264
32494
  }
32265
32495
  function processMessages(messages) {
32266
32496
  let compressedCount = 0;
32267
- const screenshotIndices = [];
32497
+ const visionIndices = [];
32268
32498
  const ariaTreeIndices = [];
32269
32499
  for (let i2 = 0; i2 < messages.length; i2++) {
32270
32500
  const message = messages[i2];
32271
32501
  if (isToolMessage(message)) {
32272
32502
  const content = message.content;
32273
- if (content.some(isScreenshotPart)) {
32274
- screenshotIndices.push(i2);
32503
+ if (content.some(isVisionPart)) {
32504
+ visionIndices.push(i2);
32275
32505
  }
32276
32506
  if (content.some(isAriaTreePart)) {
32277
32507
  ariaTreeIndices.push(i2);
32278
32508
  }
32279
32509
  }
32280
32510
  }
32281
- if (screenshotIndices.length > 2) {
32282
- const toCompress = screenshotIndices.slice(0, screenshotIndices.length - 2);
32283
- for (const idx of toCompress) {
32284
- const message = messages[idx];
32511
+ if (visionIndices.length > 2) {
32512
+ const toCompress = visionIndices.slice(0, visionIndices.length - 2);
32513
+ for (const index of toCompress) {
32514
+ const message = messages[index];
32285
32515
  if (isToolMessage(message)) {
32286
32516
  compressScreenshotMessage(message);
32517
+ compressVisionActionMessage(message);
32287
32518
  compressedCount++;
32288
32519
  }
32289
32520
  }
@@ -32315,6 +32546,24 @@ function compressScreenshotMessage(message) {
32315
32546
  }
32316
32547
  }
32317
32548
  }
32549
+ function compressVisionActionMessage(message) {
32550
+ var _a4;
32551
+ for (const part of message.content) {
32552
+ if (isVisionActionPart(part)) {
32553
+ const typedPart = part;
32554
+ if (((_a4 = typedPart.output) == null ? void 0 : _a4.value) && Array.isArray(typedPart.output.value)) {
32555
+ typedPart.output.value = typedPart.output.value.filter(
32556
+ (item) => item && typeof item === "object" && item.type !== "media"
32557
+ );
32558
+ }
32559
+ if (typedPart.result && Array.isArray(typedPart.result)) {
32560
+ typedPart.result = typedPart.result.filter(
32561
+ (item) => item && typeof item === "object" && item.type !== "media"
32562
+ );
32563
+ }
32564
+ }
32565
+ }
32566
+ }
32318
32567
  function compressAriaTreeMessage(message) {
32319
32568
  var _a4;
32320
32569
  for (const part of message.content) {
@@ -32340,6 +32589,18 @@ function compressAriaTreeMessage(message) {
32340
32589
  init_flowLogger();
32341
32590
 
32342
32591
  // lib/v3/agent/utils/actionMapping.ts
32592
+ var EXCLUDED_OUTPUT_KEYS = ["screenshotBase64"];
32593
+ function stripExcludedKeys(output) {
32594
+ const result = {};
32595
+ for (const [key, value] of Object.entries(output)) {
32596
+ if (!EXCLUDED_OUTPUT_KEYS.includes(
32597
+ key
32598
+ )) {
32599
+ result[key] = value;
32600
+ }
32601
+ }
32602
+ return result;
32603
+ }
32343
32604
  function mapToolResultToActions({
32344
32605
  toolCallName,
32345
32606
  toolResult,
@@ -32405,8 +32666,14 @@ function createStandardAction(toolCallName, toolResult, args, reasoning) {
32405
32666
  return action;
32406
32667
  }
32407
32668
  if (toolCallName !== "ariaTree" && toolResult) {
32408
- const { output } = toolResult;
32409
- Object.assign(action, output);
32669
+ const result = toolResult;
32670
+ const output = result.output;
32671
+ if (output && typeof output === "object" && !Array.isArray(output)) {
32672
+ const cleanedOutput = stripExcludedKeys(
32673
+ output
32674
+ );
32675
+ Object.assign(action, cleanedOutput);
32676
+ }
32410
32677
  }
32411
32678
  return action;
32412
32679
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@browserbasehq/stagehand",
3
- "version": "3.0.8-alpha-091296e438bb2374c8bb10ef6c08283978145ebf",
3
+ "version": "3.0.8-alpha-16d72fb4c4081dd33bf45605d75c27644ea4c00e",
4
4
  "description": "An AI web browsing framework focused on simplicity and extensibility.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.js",