@browserbasehq/orca 3.0.3-zod-1 → 3.0.5-vertex-test

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/dist/index.d.ts +1270 -1163
  2. package/dist/index.js +43919 -21270
  3. package/package.json +23 -20
  4. package/LICENSE +0 -21
package/dist/index.d.ts CHANGED
@@ -1,10 +1,11 @@
1
1
  import { ZodTypeAny, z, ZodObject, ZodRawShape, ZodError } from 'zod';
2
2
  import * as z3 from 'zod/v3';
3
3
  import { ClientOptions as ClientOptions$2 } from '@anthropic-ai/sdk';
4
+ import { GoogleVertexProviderSettings as GoogleVertexProviderSettings$1 } from '@ai-sdk/google-vertex';
4
5
  import { LanguageModelV2 } from '@ai-sdk/provider';
5
6
  import { ClientOptions as ClientOptions$1 } from 'openai';
6
- import { generateObject, generateText, streamText, streamObject, experimental_generateImage, embed, embedMany, experimental_transcribe, experimental_generateSpeech, ToolSet } from 'ai';
7
7
  import { Client, ClientOptions as ClientOptions$3 } from '@modelcontextprotocol/sdk/client/index.js';
8
+ import { ToolSet, StreamTextResult, ModelMessage, wrapLanguageModel, generateObject, generateText, streamText, streamObject, experimental_generateImage, embed, embedMany, experimental_transcribe, experimental_generateSpeech } from 'ai';
8
9
  import { Page as Page$1 } from 'playwright-core';
9
10
  export { Page as PlaywrightPage } from 'playwright-core';
10
11
  import { Page as Page$2 } from 'puppeteer-core';
@@ -28,33 +29,6 @@ declare const isZod3Schema: (schema: StagehandZodSchema) => schema is z3.ZodType
28
29
  type JsonSchemaDocument = Record<string, unknown>;
29
30
  declare function toJsonSchema(schema: StagehandZodSchema): JsonSchemaDocument;
30
31
 
31
- type AnthropicJsonSchemaObject = {
32
- definitions?: {
33
- MySchema?: {
34
- properties?: Record<string, unknown>;
35
- required?: string[];
36
- };
37
- };
38
- properties?: Record<string, unknown>;
39
- required?: string[];
40
- } & Record<string, unknown>;
41
- interface LLMTool {
42
- type: "function";
43
- name: string;
44
- description: string;
45
- parameters: Record<string, unknown>;
46
- }
47
- type AISDKProvider = (modelName: string) => LanguageModelV2;
48
- type AISDKCustomProvider = (options: {
49
- apiKey: string;
50
- }) => AISDKProvider;
51
- type AvailableModel = "gpt-4.1" | "gpt-4.1-mini" | "gpt-4.1-nano" | "o4-mini" | "o3" | "o3-mini" | "o1" | "o1-mini" | "gpt-4o" | "gpt-4o-mini" | "gpt-4o-2024-08-06" | "gpt-4.5-preview" | "o1-preview" | "claude-3-5-sonnet-latest" | "claude-3-5-sonnet-20241022" | "claude-3-5-sonnet-20240620" | "claude-3-7-sonnet-latest" | "claude-3-7-sonnet-20250219" | "cerebras-llama-3.3-70b" | "cerebras-llama-3.1-8b" | "groq-llama-3.3-70b-versatile" | "groq-llama-3.3-70b-specdec" | "gemini-1.5-flash" | "gemini-1.5-pro" | "gemini-1.5-flash-8b" | "gemini-2.0-flash-lite" | "gemini-2.0-flash" | "gemini-2.5-flash-preview-04-17" | "gemini-2.5-pro-preview-03-25" | string;
52
- type ModelProvider = "openai" | "anthropic" | "cerebras" | "groq" | "google" | "aisdk";
53
- type ClientOptions = ClientOptions$1 | ClientOptions$2;
54
- type ModelConfiguration = AvailableModel | (ClientOptions & {
55
- modelName: AvailableModel;
56
- });
57
-
58
32
  type LogLevel = 0 | 1 | 2;
59
33
  /**
60
34
  * Mapping between numeric log levels and their names
@@ -79,122 +53,6 @@ type LogLine = {
79
53
  };
80
54
  type Logger = (logLine: LogLine) => void;
81
55
 
82
- interface ChatMessage {
83
- role: "system" | "user" | "assistant";
84
- content: ChatMessageContent;
85
- }
86
- type ChatMessageContent = string | (ChatMessageImageContent | ChatMessageTextContent)[];
87
- interface ChatMessageImageContent {
88
- type: string;
89
- image_url?: {
90
- url: string;
91
- };
92
- text?: string;
93
- source?: {
94
- type: string;
95
- media_type: string;
96
- data: string;
97
- };
98
- }
99
- interface ChatMessageTextContent {
100
- type: string;
101
- text: string;
102
- }
103
- declare const AnnotatedScreenshotText = "This is a screenshot of the current page state with the elements annotated on it. Each element id is annotated with a number to the top left of it. Duplicate annotations at the same location are under each other vertically.";
104
- interface ChatCompletionOptions {
105
- messages: ChatMessage[];
106
- temperature?: number;
107
- top_p?: number;
108
- frequency_penalty?: number;
109
- presence_penalty?: number;
110
- image?: {
111
- buffer: Buffer;
112
- description?: string;
113
- };
114
- response_model?: {
115
- name: string;
116
- schema: StagehandZodSchema;
117
- };
118
- tools?: LLMTool[];
119
- tool_choice?: "auto" | "none" | "required";
120
- maxOutputTokens?: number;
121
- requestId?: string;
122
- }
123
- type LLMResponse = {
124
- id: string;
125
- object: string;
126
- created: number;
127
- model: string;
128
- choices: {
129
- index: number;
130
- message: {
131
- role: string;
132
- content: string | null;
133
- tool_calls: {
134
- id: string;
135
- type: string;
136
- function: {
137
- name: string;
138
- arguments: string;
139
- };
140
- }[];
141
- };
142
- finish_reason: string;
143
- }[];
144
- usage: {
145
- prompt_tokens: number;
146
- completion_tokens: number;
147
- total_tokens: number;
148
- };
149
- };
150
- interface CreateChatCompletionOptions {
151
- options: ChatCompletionOptions;
152
- logger: (message: LogLine) => void;
153
- retries?: number;
154
- }
155
- /** Simple usage shape if your LLM returns usage tokens. */
156
- interface LLMUsage {
157
- prompt_tokens: number;
158
- completion_tokens: number;
159
- total_tokens: number;
160
- reasoning_tokens?: number;
161
- cached_input_tokens?: number;
162
- }
163
- /**
164
- * For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }
165
- */
166
- interface LLMParsedResponse<T> {
167
- data: T;
168
- usage?: LLMUsage;
169
- }
170
- declare abstract class LLMClient {
171
- type: "openai" | "anthropic" | "cerebras" | "groq" | (string & {});
172
- modelName: AvailableModel | (string & {});
173
- hasVision: boolean;
174
- clientOptions: ClientOptions;
175
- userProvidedInstructions?: string;
176
- constructor(modelName: AvailableModel, userProvidedInstructions?: string);
177
- abstract createChatCompletion<T>(options: CreateChatCompletionOptions & {
178
- options: {
179
- response_model: {
180
- name: string;
181
- schema: StagehandZodSchema;
182
- };
183
- };
184
- }): Promise<LLMParsedResponse<T>>;
185
- abstract createChatCompletion<T = LLMResponse>(options: CreateChatCompletionOptions): Promise<T>;
186
- generateObject: typeof generateObject;
187
- generateText: typeof generateText;
188
- streamText: typeof streamText;
189
- streamObject: typeof streamObject;
190
- generateImage: typeof experimental_generateImage;
191
- embed: typeof embed;
192
- embedMany: typeof embedMany;
193
- transcribe: typeof experimental_transcribe;
194
- generateSpeech: typeof experimental_generateSpeech;
195
- getLanguageModel?(): LanguageModelV2;
196
- }
197
-
198
56
  /**
199
57
  * CDP transport & session multiplexer
200
58
  *
@@ -305,7 +163,7 @@ declare class Frame implements FrameManager {
305
163
  /** Child frames via Page.getFrameTree */
306
164
  childFrames(): Promise<Frame[]>;
307
165
  /** Wait for a lifecycle state (load/domcontentloaded/networkidle) */
308
- waitForLoadState(state?: "load" | "domcontentloaded" | "networkidle"): Promise<void>;
166
+ waitForLoadState(state?: "load" | "domcontentloaded" | "networkidle", timeoutMs?: number): Promise<void>;
309
167
  /** Simple placeholder for your own locator abstraction */
310
168
  locator(selector: string, options?: {
311
169
  deep?: boolean;
@@ -643,812 +501,142 @@ declare class ConsoleMessage {
643
501
  toString(): string;
644
502
  }
645
503
 
646
- /**
647
- * Response
648
- * -----------------
649
- *
650
- * This module implements a Playwright-inspired response wrapper that exposes
651
- * navigation metadata and helpers for retrieving HTTP response bodies. The
652
- * abstraction is consumed by navigation routines (e.g. `Page.goto`) so callers
653
- * can synchronously inspect status codes, lazily fetch body text, or await the
654
- * network layer finishing the request. The implementation is built directly on
655
- * Chrome DevTools Protocol primitives – it holds the originating `requestId`
656
- * so it can request payloads via `Network.getResponseBody`, and it listens for
657
- * `responseReceivedExtraInfo`, `loadingFinished`, and `loadingFailed` events to
658
- * hydrate the richer header view and resolve callers waiting on completion.
659
- */
504
+ declare class StagehandAPIError extends Error {
505
+ constructor(message: string);
506
+ }
507
+ declare class StagehandAPIUnauthorizedError extends StagehandAPIError {
508
+ constructor(message?: string);
509
+ }
510
+ declare class StagehandHttpError extends StagehandAPIError {
511
+ constructor(message: string);
512
+ }
513
+ declare class StagehandServerError extends StagehandAPIError {
514
+ constructor(message: string);
515
+ }
516
+ declare class StagehandResponseBodyError extends StagehandAPIError {
517
+ constructor();
518
+ }
519
+ declare class StagehandResponseParseError extends StagehandAPIError {
520
+ constructor(message: string);
521
+ }
660
522
 
661
- type ServerAddr = {
662
- ipAddress: string;
663
- port: number;
664
- };
665
- /**
666
- * Thin wrapper around CDP response metadata that mirrors the ergonomics of
667
- * Playwright's `Response` class. The class intentionally keeps the same method
668
- * names so upstream integrations can transition with minimal code changes.
669
- */
670
- declare class Response$1 {
671
- private readonly page;
672
- private readonly session;
673
- private readonly requestId;
674
- private readonly frameId?;
675
- private readonly loaderId?;
676
- private readonly response;
677
- private readonly fromServiceWorkerFlag;
678
- private readonly serverAddress?;
679
- private headersObject;
680
- private headersArrayCache;
681
- private allHeadersCache;
682
- private readonly headerValuesMap;
683
- private finishedDeferred;
684
- private finishedSettled;
685
- private extraInfoHeaders;
686
- private extraInfoHeadersText;
687
- /**
688
- * Build a response wrapper from the CDP notification associated with a
689
- * navigation. The constructor captures the owning page/session so follow-up
690
- * methods (body/text/json) can query CDP on-demand. The `response` payload is
691
- * the raw `Protocol.Network.Response` object emitted by Chrome.
692
- */
693
- constructor(params: {
694
- page: Page;
695
- session: CDPSessionLike;
696
- requestId: string;
697
- frameId?: string;
698
- loaderId?: string;
699
- response: Protocol.Network.Response;
700
- fromServiceWorker: boolean;
701
- });
702
- /** URL associated with the navigation request. */
703
- url(): string;
704
- /** HTTP status code reported by Chrome. */
705
- status(): number;
706
- /** Human-readable status text that accompanied the response. */
707
- statusText(): string;
708
- /** Convenience predicate that checks for 2xx statuses. */
709
- ok(): boolean;
710
- /** Returns the Stagehand frame object that initiated the navigation. */
711
- frame(): Frame | null;
712
- /** Indicates whether the response was serviced by a Service Worker. */
713
- fromServiceWorker(): boolean;
714
- /**
715
- * Returns TLS security metadata when provided by the browser. In practice
716
- * this includes certificate issuer, protocol, and validity interval.
717
- */
718
- securityDetails(): Promise<Protocol.Network.SecurityDetails | null>;
719
- /** Returns the resolved server address for the navigation when available. */
720
- serverAddr(): Promise<ServerAddr | null>;
721
- /**
722
- * Returns the response headers normalised to lowercase keys. Matches the
723
- * behaviour of Playwright's `headers()` by eliding duplicate header entries.
724
- */
725
- headers(): Record<string, string>;
726
- /**
727
- * Returns all headers including those only surfaced through
728
- * `responseReceivedExtraInfo` such as `set-cookie`. Values are reported as the
729
- * browser sends them (no further splitting or concatenation).
730
- */
731
- allHeaders(): Promise<Record<string, string>>;
732
- /** Returns a concatenated header string for the supplied header name. */
733
- headerValue(name: string): Promise<string | null>;
734
- /** Returns all values for a header (case-insensitive lookup). */
735
- headerValues(name: string): Promise<string[]>;
736
- /**
737
- * Returns header entries preserving their original wire casing and ordering.
738
- * Falls back to the CDP object when the raw header text is unavailable.
739
- */
740
- headersArray(): Promise<Array<{
741
- name: string;
742
- value: string;
743
- }>>;
744
- /**
745
- * Requests the raw response body from Chrome DevTools Protocol. The method is
746
- * intentionally lazy because not every caller needs the payload, and CDP only
747
- * allows retrieving it once the response completes.
748
- */
749
- body(): Promise<Buffer>;
750
- /** Decodes the response body as UTF-8 text. */
751
- text(): Promise<string>;
752
- /** Parses the response body as JSON and throws if parsing fails. */
753
- json<T = unknown>(): Promise<T>;
754
- /**
755
- * Resolves once the underlying network request completes or fails. Mirrors
756
- * Playwright's behaviour by resolving to `null` on success and to an `Error`
757
- * instance when Chrome reports `Network.loadingFailed`.
758
- */
759
- finished(): Promise<null | Error>;
760
- /**
761
- * Internal helper invoked by the navigation tracker when CDP reports extra
762
- * header information. This keeps the cached header views in sync with the
763
- * richer metadata.
764
- */
765
- applyExtraInfo(event: Protocol.Network.ResponseReceivedExtraInfoEvent): void;
766
- /**
767
- * Internal helper for creating a Response object from a Serializable
768
- * goto response from the Stagehand API
769
- */
770
- static fromSerializable(serialized: SerializableResponse, context: {
771
- page: Page;
772
- session: CDPSessionLike;
773
- }): Response$1;
774
- /** Marks the response as finished and resolves the `finished()` promise. */
775
- markFinished(error: Error | null): void;
523
+ interface ActOptions {
524
+ model?: ModelConfiguration;
525
+ variables?: Record<string, string>;
526
+ timeout?: number;
527
+ page?: Page$1 | Page$2 | Page$3 | Page;
776
528
  }
777
-
778
- type AnyPage = Page$1 | Page$2 | Page$3 | Page;
779
-
780
- type LoadState = "load" | "domcontentloaded" | "networkidle";
781
-
782
- declare class StagehandAPIClient {
783
- private apiKey;
784
- private projectId;
785
- private sessionId?;
786
- private modelApiKey;
787
- private logger;
788
- private fetchWithCookies;
789
- constructor({ apiKey, projectId, logger }: StagehandAPIConstructorParams);
790
- init({ modelName, modelApiKey, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, }: StartSessionParams): Promise<StartSessionResult>;
791
- act({ input, options, frameId }: APIActParameters): Promise<ActResult>;
792
- extract<T extends StagehandZodSchema>({ instruction, schema: zodSchema, options, frameId, }: APIExtractParameters): Promise<ExtractResult<T>>;
793
- observe({ instruction, options, frameId, }: APIObserveParameters): Promise<Action[]>;
794
- goto(url: string, options?: {
795
- waitUntil?: "load" | "domcontentloaded" | "networkidle";
796
- }, frameId?: string): Promise<SerializableResponse | null>;
797
- agentExecute(agentConfig: AgentConfig, executeOptions: AgentExecuteOptions | string, frameId?: string): Promise<AgentResult>;
798
- end(): Promise<Response>;
799
- getReplayMetrics(): Promise<StagehandMetrics>;
800
- private execute;
801
- private request;
529
+ interface ActResult {
530
+ success: boolean;
531
+ message: string;
532
+ actionDescription: string;
533
+ actions: Action[];
802
534
  }
803
-
804
- type ScreenshotAnimationsOption = "disabled" | "allow";
805
- type ScreenshotCaretOption = "hide" | "initial";
806
- type ScreenshotScaleOption = "css" | "device";
807
- interface ScreenshotClip {
808
- x: number;
809
- y: number;
810
- width: number;
811
- height: number;
535
+ type ExtractResult<T extends StagehandZodSchema> = InferStagehandSchema<T>;
536
+ interface Action {
537
+ selector: string;
538
+ description: string;
539
+ method?: string;
540
+ arguments?: string[];
812
541
  }
813
- interface ScreenshotOptions {
814
- animations?: ScreenshotAnimationsOption;
815
- caret?: ScreenshotCaretOption;
816
- clip?: ScreenshotClip;
817
- fullPage?: boolean;
818
- mask?: Locator[];
819
- maskColor?: string;
820
- omitBackground?: boolean;
821
- path?: string;
822
- quality?: number;
823
- scale?: ScreenshotScaleOption;
824
- style?: string;
542
+ interface HistoryEntry {
543
+ method: "act" | "extract" | "observe" | "navigate" | "agent";
544
+ parameters: unknown;
545
+ result: unknown;
546
+ timestamp: string;
547
+ }
548
+ interface ExtractOptions {
549
+ model?: ModelConfiguration;
825
550
  timeout?: number;
826
- type?: "png" | "jpeg";
551
+ selector?: string;
552
+ page?: Page$1 | Page$2 | Page$3 | Page;
553
+ }
554
+ declare const defaultExtractSchema: z.ZodObject<{
555
+ extraction: z.ZodString;
556
+ }, z.core.$strip>;
557
+ declare const pageTextSchema: z.ZodObject<{
558
+ pageText: z.ZodString;
559
+ }, z.core.$strip>;
560
+ interface ObserveOptions {
561
+ model?: ModelConfiguration;
562
+ timeout?: number;
563
+ selector?: string;
564
+ page?: Page$1 | Page$2 | Page$3 | Page;
565
+ }
566
+ declare enum V3FunctionName {
567
+ ACT = "ACT",
568
+ EXTRACT = "EXTRACT",
569
+ OBSERVE = "OBSERVE",
570
+ AGENT = "AGENT"
827
571
  }
828
572
 
829
- declare class Page {
830
- private readonly conn;
831
- private readonly mainSession;
832
- private readonly _targetId;
833
- /** Every CDP child session this page owns (top-level + adopted OOPIF sessions). */
834
- private readonly sessions;
835
- /** Unified truth for frame topology + ownership. */
836
- private readonly registry;
837
- /** A convenience wrapper bound to the current main frame id (top-level session). */
838
- private mainFrameWrapper;
839
- /** Compact ordinal per frameId (used by snapshot encoding). */
840
- private frameOrdinals;
841
- private nextOrdinal;
842
- /** cache Frames per frameId so everyone uses the same one */
843
- private readonly frameCache;
844
- private readonly browserIsRemote;
845
- /** Stable id for Frames created by this Page (use top-level TargetId). */
846
- private readonly pageId;
847
- /** Cached current URL for synchronous page.url() */
848
- private _currentUrl;
849
- private navigationCommandSeq;
850
- private latestNavigationCommandId;
851
- private readonly networkManager;
852
- /** Optional API client for routing page operations to the API */
853
- private readonly apiClient;
854
- private readonly consoleListeners;
855
- private readonly consoleHandlers;
856
- private constructor();
857
- private cursorEnabled;
858
- private ensureCursorScript;
859
- enableCursorOverlay(): Promise<void>;
860
- private updateCursor;
861
- /**
862
- * Factory: create Page and seed registry with the shallow tree from Page.getFrameTree.
863
- * Assumes Page domain is already enabled on the session passed in.
864
- */
865
- static create(conn: CdpConnection, session: CDPSessionLike, targetId: string, apiClient?: StagehandAPIClient | null, localBrowserLaunchOptions?: LocalBrowserLaunchOptions | null, browserIsRemote?: boolean): Promise<Page>;
866
- /**
867
- * Parent/child session emitted a `frameAttached`.
868
- * Topology update + ownership stamped to **emitting session**.
869
- */
870
- onFrameAttached(frameId: string, parentId: string | null, session: CDPSessionLike): void;
871
- /**
872
- * Parent/child session emitted a `frameDetached`.
873
- */
874
- onFrameDetached(frameId: string, reason?: "remove" | "swap" | string): void;
875
- /**
876
- * Parent/child session emitted a `frameNavigated`.
877
- * Topology + ownership update. Handles root swaps.
878
- */
879
- onFrameNavigated(frame: Protocol.Page.Frame, session: CDPSessionLike): void;
880
- onNavigatedWithinDocument(frameId: string, url: string, session: CDPSessionLike): void;
573
+ interface StagehandMetrics {
574
+ actPromptTokens: number;
575
+ actCompletionTokens: number;
576
+ actReasoningTokens: number;
577
+ actCachedInputTokens: number;
578
+ actInferenceTimeMs: number;
579
+ extractPromptTokens: number;
580
+ extractCompletionTokens: number;
581
+ extractReasoningTokens: number;
582
+ extractCachedInputTokens: number;
583
+ extractInferenceTimeMs: number;
584
+ observePromptTokens: number;
585
+ observeCompletionTokens: number;
586
+ observeReasoningTokens: number;
587
+ observeCachedInputTokens: number;
588
+ observeInferenceTimeMs: number;
589
+ agentPromptTokens: number;
590
+ agentCompletionTokens: number;
591
+ agentReasoningTokens: number;
592
+ agentCachedInputTokens: number;
593
+ agentInferenceTimeMs: number;
594
+ totalPromptTokens: number;
595
+ totalCompletionTokens: number;
596
+ totalReasoningTokens: number;
597
+ totalCachedInputTokens: number;
598
+ totalInferenceTimeMs: number;
599
+ }
600
+
601
+ type V3Env = "LOCAL" | "BROWSERBASE";
602
+ /** Local launch options for V3 (chrome-launcher + CDP).
603
+ * Matches v2 shape where feasible; unsupported fields are accepted but ignored.
604
+ */
605
+ interface LocalBrowserLaunchOptions {
606
+ args?: string[];
607
+ executablePath?: string;
608
+ userDataDir?: string;
609
+ preserveUserDataDir?: boolean;
610
+ headless?: boolean;
611
+ devtools?: boolean;
612
+ chromiumSandbox?: boolean;
613
+ ignoreDefaultArgs?: boolean | string[];
614
+ proxy?: {
615
+ server: string;
616
+ bypass?: string;
617
+ username?: string;
618
+ password?: string;
619
+ };
620
+ locale?: string;
621
+ viewport?: {
622
+ width: number;
623
+ height: number;
624
+ };
625
+ deviceScaleFactor?: number;
626
+ hasTouch?: boolean;
627
+ ignoreHTTPSErrors?: boolean;
628
+ cdpUrl?: string;
629
+ connectTimeoutMs?: number;
630
+ downloadsPath?: string;
631
+ acceptDownloads?: boolean;
632
+ }
633
+ /** Constructor options for V3 */
634
+ interface V3Options {
635
+ env: V3Env;
636
+ apiKey?: string;
637
+ projectId?: string;
881
638
  /**
882
- * An OOPIF child session whose **main** frame id equals the parent iframe’s frameId
883
- * has been attached; adopt the session into this Page and seed ownership for its subtree.
884
- */
885
- adoptOopifSession(childSession: CDPSessionLike, childMainFrameId: string): void;
886
- /** Detach an adopted child session and prune its subtree */
887
- detachOopifSession(sessionId: string): void;
888
- /** Return the owning CDP session for a frameId (falls back to main session) */
889
- getSessionForFrame(frameId: string): CDPSessionLike;
890
- /** Always returns a Frame bound to the owning session */
891
- frameForId(frameId: string): Frame;
892
- /** Expose a session by id (used by snapshot to resolve session id -> session) */
893
- getSessionById(id: string): CDPSessionLike | undefined;
894
- registerSessionForNetwork(session: CDPSessionLike): void;
895
- unregisterSessionForNetwork(sessionId: string | undefined): void;
896
- on(event: "console", listener: ConsoleListener): Page;
897
- once(event: "console", listener: ConsoleListener): Page;
898
- off(event: "console", listener: ConsoleListener): Page;
899
- targetId(): string;
900
- /**
901
- * Send a CDP command through the main session.
902
- * Allows external consumers to execute arbitrary Chrome DevTools Protocol commands.
903
- *
904
- * @param method - The CDP method name (e.g., "Page.enable", "Runtime.evaluate")
905
- * @param params - Optional parameters for the CDP command
906
- * @returns Promise resolving to the typed CDP response
907
- *
908
- * @example
909
- * // Enable the Runtime domain
910
- * await page.sendCDP("Runtime.enable");
911
- *
912
- * @example
913
- * // Evaluate JavaScript with typed response
914
- * const result = await page.sendCDP<Protocol.Runtime.EvaluateResponse>(
915
- * "Runtime.evaluate",
916
- * { expression: "1 + 1" }
917
- * );
918
- */
919
- sendCDP<T = unknown>(method: string, params?: object): Promise<T>;
920
- /** Seed the cached URL before navigation events converge. */
921
- seedCurrentUrl(url: string | undefined | null): void;
922
- mainFrameId(): string;
923
- mainFrame(): Frame;
924
- /**
925
- * Close this top-level page (tab). Best-effort via Target.closeTarget.
926
- */
927
- close(): Promise<void>;
928
- getFullFrameTree(): Protocol.Page.FrameTree;
929
- asProtocolFrameTree(rootMainFrameId: string): Protocol.Page.FrameTree;
930
- private ensureOrdinal;
931
- /** Public getter for snapshot code / handlers. */
932
- getOrdinal(frameId: string): number;
933
- listAllFrameIds(): string[];
934
- private ensureConsoleTaps;
935
- private installConsoleTap;
936
- private sessionKey;
937
- private resolveSessionByKey;
938
- private teardownConsoleTap;
939
- private removeAllConsoleTaps;
940
- private emitConsole;
941
- /**
942
- * Navigate the page; optionally wait for a lifecycle state.
943
- * Waits on the **current** main frame and follows root swaps during navigation.
944
- */
945
- goto(url: string, options?: {
946
- waitUntil?: LoadState;
947
- timeoutMs?: number;
948
- }): Promise<Response$1 | null>;
949
- /**
950
- * Reload the page; optionally wait for a lifecycle state.
951
- */
952
- reload(options?: {
953
- waitUntil?: LoadState;
954
- timeoutMs?: number;
955
- ignoreCache?: boolean;
956
- }): Promise<Response$1 | null>;
957
- /**
958
- * Navigate back in history if possible; optionally wait for a lifecycle state.
959
- */
960
- goBack(options?: {
961
- waitUntil?: LoadState;
962
- timeoutMs?: number;
963
- }): Promise<Response$1 | null>;
964
- /**
965
- * Navigate forward in history if possible; optionally wait for a lifecycle state.
966
- */
967
- goForward(options?: {
968
- waitUntil?: LoadState;
969
- timeoutMs?: number;
970
- }): Promise<Response$1 | null>;
971
- /**
972
- * Return the current page URL (synchronous, cached from navigation events).
973
- */
974
- url(): string;
975
- private beginNavigationCommand;
976
- isCurrentNavigationCommand(id: number): boolean;
977
- /**
978
- * Return the current page title.
979
- * Prefers reading from the active document via Runtime.evaluate to reflect dynamic changes.
980
- * Falls back to navigation history title if evaluation is unavailable.
981
- */
982
- title(): Promise<string>;
983
- /**
984
- * Capture a screenshot with Playwright-style options.
985
- *
986
- * @param options Optional screenshot configuration.
987
- * @param options.animations Control CSS/Web animations during capture. Use
988
- * "disabled" to fast-forward finite animations and pause infinite ones.
989
- * @param options.caret Either hide the text caret (default) or leave it
990
- * visible via "initial".
991
- * @param options.clip Restrict capture to a specific rectangle (in CSS
992
- * pixels). Cannot be combined with `fullPage`.
993
- * @param options.fullPage Capture the full scrollable page instead of the
994
- * current viewport.
995
- * @param options.mask Array of locators that should be covered with an
996
- * overlay while the screenshot is taken.
997
- * @param options.maskColor CSS color used for the mask overlay (default
998
- * `#FF00FF`).
999
- * @param options.omitBackground Make the default page background transparent
1000
- * (PNG only).
1001
- * @param options.path File path to write the screenshot to. The file extension
1002
- * determines the image type when `type` is not explicitly provided.
1003
- * @param options.quality JPEG quality (0–100). Only applies when
1004
- * `type === "jpeg"`.
1005
- * @param options.scale Render scale: use "css" for one pixel per CSS pixel,
1006
- * otherwise the default "device" leverages the current device pixel ratio.
1007
- * @param options.style Additional CSS text injected into every frame before
1008
- * capture (removed afterwards).
1009
- * @param options.timeout Maximum capture duration in milliseconds before a
1010
- * timeout error is thrown.
1011
- * @param options.type Image format (`"png"` by default).
1012
- */
1013
- screenshot(options?: ScreenshotOptions): Promise<Buffer>;
1014
- /**
1015
- * Create a locator bound to the current main frame.
1016
- */
1017
- locator(selector: string): ReturnType<Frame["locator"]>;
1018
- /**
1019
- * Deep locator that supports cross-iframe traversal.
1020
- * - Recognizes '>>' hop notation to enter iframe contexts.
1021
- * - Supports deep XPath that includes iframe steps (e.g., '/html/body/iframe[2]//div').
1022
- * Returns a Locator scoped to the appropriate frame.
1023
- */
1024
- deepLocator(selector: string): DeepLocatorDelegate;
1025
- /**
1026
- * Frame locator similar to Playwright: targets iframe elements and scopes
1027
- * subsequent locators to that frame. Supports chaining.
1028
- */
1029
- frameLocator(selector: string): FrameLocator;
1030
- /**
1031
- * List all frames belonging to this page as Frame objects bound to their owning sessions.
1032
- * The list is ordered by a stable ordinal assigned during the page lifetime.
1033
- */
1034
- frames(): Frame[];
1035
- /**
1036
- * Wait until the page reaches a lifecycle state on the current main frame.
1037
- * Mirrors Playwright's API signatures.
1038
- */
1039
- waitForLoadState(state: LoadState, timeoutMs?: number): Promise<void>;
1040
- /**
1041
- * Evaluate a function or expression in the current main frame's isolated world.
1042
- * - If a string is provided, it is treated as a JS expression.
1043
- * - If a function is provided, it is stringified and invoked with the optional argument.
1044
- * - The return value should be JSON-serializable. Non-serializable objects will
1045
- * best-effort serialize via JSON.stringify inside the page context.
1046
- */
1047
- evaluate<R = unknown, Arg = unknown>(pageFunctionOrExpression: string | ((arg: Arg) => R | Promise<R>), arg?: Arg): Promise<R>;
1048
- /**
1049
- * Force the page viewport to an exact CSS size and device scale factor.
1050
- * Ensures screenshots match width x height pixels when deviceScaleFactor = 1.
1051
- */
1052
- setViewportSize(width: number, height: number, options?: {
1053
- deviceScaleFactor?: number;
1054
- }): Promise<void>;
1055
- /**
1056
- * Click at absolute page coordinates (CSS pixels).
1057
- * Dispatches mouseMoved → mousePressed → mouseReleased via CDP Input domain
1058
- * on the top-level page target's session. Coordinates are relative to the
1059
- * viewport origin (top-left). Does not scroll.
1060
- */
1061
- click(x: number, y: number, options: {
1062
- button?: "left" | "right" | "middle";
1063
- clickCount?: number;
1064
- returnXpath: true;
1065
- }): Promise<string>;
1066
- click(x: number, y: number, options?: {
1067
- button?: "left" | "right" | "middle";
1068
- clickCount?: number;
1069
- returnXpath?: false;
1070
- }): Promise<void>;
1071
- click(x: number, y: number, options: {
1072
- button?: "left" | "right" | "middle";
1073
- clickCount?: number;
1074
- returnXpath: boolean;
1075
- }): Promise<void | string>;
1076
- scroll(x: number, y: number, deltaX: number, deltaY: number, options: {
1077
- returnXpath: true;
1078
- }): Promise<string>;
1079
- scroll(x: number, y: number, deltaX: number, deltaY: number, options?: {
1080
- returnXpath?: false;
1081
- }): Promise<void>;
1082
- scroll(x: number, y: number, deltaX: number, deltaY: number, options: {
1083
- returnXpath: boolean;
1084
- }): Promise<void | string>;
1085
- /**
1086
- * Drag from (fromX, fromY) to (toX, toY) using mouse events.
1087
- * Sends mouseMoved → mousePressed → mouseMoved (steps) → mouseReleased.
1088
- */
1089
- dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options: {
1090
- button?: "left" | "right" | "middle";
1091
- steps?: number;
1092
- delay?: number;
1093
- returnXpath: true;
1094
- }): Promise<[string, string]>;
1095
- dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options?: {
1096
- button?: "left" | "right" | "middle";
1097
- steps?: number;
1098
- delay?: number;
1099
- returnXpath?: false;
1100
- }): Promise<void>;
1101
- dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options: {
1102
- button?: "left" | "right" | "middle";
1103
- steps?: number;
1104
- delay?: number;
1105
- returnXpath: boolean;
1106
- }): Promise<void | [string, string]>;
1107
- /**
1108
- * Type a string by dispatching keyDown/keyUp events per character.
1109
- * Focus must already be on the desired element. Uses CDP Input.dispatchKeyEvent
1110
- * and never falls back to Input.insertText. Optional delay applies between
1111
- * successive characters.
1112
- */
1113
- type(text: string, options?: {
1114
- delay?: number;
1115
- withMistakes?: boolean;
1116
- }): Promise<void>;
1117
- /**
1118
- * Press a single key or key combination (keyDown then keyUp).
1119
- * For printable characters, uses the text path on keyDown; for named keys, sets key/code/VK.
1120
- * Supports key combinations with modifiers like "Cmd+A", "Ctrl+C", "Shift+Tab", etc.
1121
- */
1122
- keyPress(key: string, options?: {
1123
- delay?: number;
1124
- }): Promise<void>;
1125
- private _pressedModifiers;
1126
- /** Press a key down without releasing it */
1127
- private keyDown;
1128
- /** Release a pressed key */
1129
- private keyUp;
1130
- /** Normalize modifier key names to match CDP expectations */
1131
- private normalizeModifierKey;
1132
- /**
1133
- * Get the map of named keys with their properties
1134
- */
1135
- private getNamedKeys;
1136
- /**
1137
- * Minimal description for printable keys (letters/digits/space) to provide code and VK.
1138
- * Used when non-Shift modifiers are pressed to avoid sending text while keeping accelerator info.
1139
- */
1140
- private describePrintableKey;
1141
- private isMacOS;
1142
- /**
1143
- * Return Chromium mac editing commands (without trailing ':') for a given code like 'KeyA'
1144
- * Only used on macOS to trigger system editing shortcuts (e.g., selectAll, copy, paste...).
1145
- */
1146
- private macCommandsFor;
1147
- /**
1148
- * Create an isolated world for the **current** main frame and return its context id.
1149
- */
1150
- private createIsolatedWorldForCurrentMain;
1151
- /**
1152
- * Wait until the **current** main frame reaches a lifecycle state.
1153
- * - Fast path via `document.readyState`.
1154
- * - Event path listens at the session level and compares incoming `frameId`
1155
- * to `mainFrameId()` **at event time** to follow root swaps.
1156
- */
1157
- waitForMainLoadState(state: LoadState, timeoutMs?: number): Promise<void>;
1158
- }
1159
-
1160
- interface AgentAction {
1161
- type: string;
1162
- reasoning?: string;
1163
- taskCompleted?: boolean;
1164
- action?: string;
1165
- timeMs?: number;
1166
- pageText?: string;
1167
- pageUrl?: string;
1168
- instruction?: string;
1169
- [key: string]: unknown;
1170
- }
1171
- interface AgentResult {
1172
- success: boolean;
1173
- message: string;
1174
- actions: AgentAction[];
1175
- completed: boolean;
1176
- metadata?: Record<string, unknown>;
1177
- usage?: {
1178
- input_tokens: number;
1179
- output_tokens: number;
1180
- reasoning_tokens?: number;
1181
- cached_input_tokens?: number;
1182
- inference_time_ms: number;
1183
- };
1184
- }
1185
- interface AgentExecuteOptions {
1186
- instruction: string;
1187
- maxSteps?: number;
1188
- page?: Page$1 | Page$2 | Page$3 | Page;
1189
- highlightCursor?: boolean;
1190
- }
1191
- type AgentType = "openai" | "anthropic" | "google";
1192
- declare const AVAILABLE_CUA_MODELS: readonly ["openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", "anthropic/claude-3-7-sonnet-latest", "anthropic/claude-haiku-4-5-20251001", "anthropic/claude-sonnet-4-20250514", "anthropic/claude-sonnet-4-5-20250929", "google/gemini-2.5-computer-use-preview-10-2025"];
1193
- type AvailableCuaModel = (typeof AVAILABLE_CUA_MODELS)[number];
1194
- interface AgentExecutionOptions<TOptions extends AgentExecuteOptions = AgentExecuteOptions> {
1195
- options: TOptions;
1196
- logger: (message: LogLine) => void;
1197
- retries?: number;
1198
- }
1199
- interface AgentHandlerOptions {
1200
- modelName: string;
1201
- clientOptions?: Record<string, unknown>;
1202
- userProvidedInstructions?: string;
1203
- experimental?: boolean;
1204
- }
1205
- interface ActionExecutionResult {
1206
- success: boolean;
1207
- error?: string;
1208
- data?: unknown;
1209
- }
1210
- interface ToolUseItem extends ResponseItem {
1211
- type: "tool_use";
1212
- id: string;
1213
- name: string;
1214
- input: Record<string, unknown>;
1215
- }
1216
- interface AnthropicMessage {
1217
- role: string;
1218
- content: string | Array<AnthropicContentBlock>;
1219
- }
1220
- interface AnthropicContentBlock {
1221
- type: string;
1222
- [key: string]: unknown;
1223
- }
1224
- interface AnthropicTextBlock extends AnthropicContentBlock {
1225
- type: "text";
1226
- text: string;
1227
- }
1228
- interface AnthropicToolResult {
1229
- type: "tool_result";
1230
- tool_use_id: string;
1231
- content: string | Array<AnthropicContentBlock>;
1232
- }
1233
- interface ResponseItem {
1234
- type: string;
1235
- id: string;
1236
- [key: string]: unknown;
1237
- }
1238
- interface ComputerCallItem extends ResponseItem {
1239
- type: "computer_call";
1240
- call_id: string;
1241
- action: {
1242
- type: string;
1243
- [key: string]: unknown;
1244
- };
1245
- pending_safety_checks?: Array<{
1246
- id: string;
1247
- code: string;
1248
- message: string;
1249
- }>;
1250
- }
1251
- interface FunctionCallItem extends ResponseItem {
1252
- type: "function_call";
1253
- call_id: string;
1254
- name: string;
1255
- arguments: string;
1256
- }
1257
- type ResponseInputItem = {
1258
- role: string;
1259
- content: string;
1260
- } | {
1261
- type: "computer_call_output";
1262
- call_id: string;
1263
- output: {
1264
- type: "input_image";
1265
- image_url: string;
1266
- current_url?: string;
1267
- error?: string;
1268
- [key: string]: unknown;
1269
- } | string;
1270
- acknowledged_safety_checks?: Array<{
1271
- id: string;
1272
- code: string;
1273
- message: string;
1274
- }>;
1275
- } | {
1276
- type: "function_call_output";
1277
- call_id: string;
1278
- output: string;
1279
- };
1280
- interface AgentInstance {
1281
- execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentResult>;
1282
- }
1283
- type AgentProviderType = AgentType;
1284
- type AgentModelConfig<TModelName extends string = string> = {
1285
- modelName: TModelName;
1286
- } & Record<string, unknown>;
1287
- type AgentConfig = {
1288
- /**
1289
- * Custom system prompt to provide to the agent. Overrides the default system prompt.
1290
- */
1291
- systemPrompt?: string;
1292
- /**
1293
- * MCP integrations - Array of Client objects
1294
- */
1295
- integrations?: (Client | string)[];
1296
- /**
1297
- * Tools passed to the agent client
1298
- */
1299
- tools?: ToolSet;
1300
- /**
1301
- * Indicates CUA is disabled for this configuration
1302
- */
1303
- cua?: boolean;
1304
- /**
1305
- * The model to use for agent functionality
1306
- */
1307
- model?: string | AgentModelConfig<string>;
1308
- /**
1309
- * The model to use for tool execution (observe/act calls within agent tools).
1310
- * If not specified, inherits from the main model configuration.
1311
- * Format: "provider/model" (e.g., "openai/gpt-4o-mini", "google/gemini-2.0-flash-exp")
1312
- */
1313
- executionModel?: string | AgentModelConfig<string>;
1314
- };
1315
-
1316
- declare class StagehandAPIError extends Error {
1317
- constructor(message: string);
1318
- }
1319
- declare class StagehandAPIUnauthorizedError extends StagehandAPIError {
1320
- constructor(message?: string);
1321
- }
1322
- declare class StagehandHttpError extends StagehandAPIError {
1323
- constructor(message: string);
1324
- }
1325
- declare class StagehandServerError extends StagehandAPIError {
1326
- constructor(message: string);
1327
- }
1328
- declare class StagehandResponseBodyError extends StagehandAPIError {
1329
- constructor();
1330
- }
1331
- declare class StagehandResponseParseError extends StagehandAPIError {
1332
- constructor(message: string);
1333
- }
1334
-
1335
- interface ActOptions {
1336
- model?: ModelConfiguration;
1337
- variables?: Record<string, string>;
1338
- timeout?: number;
1339
- page?: Page$1 | Page$2 | Page$3 | Page;
1340
- }
1341
- interface ActResult {
1342
- success: boolean;
1343
- message: string;
1344
- actionDescription: string;
1345
- actions: Action[];
1346
- }
1347
- type ExtractResult<T extends StagehandZodSchema> = InferStagehandSchema<T>;
1348
- interface Action {
1349
- selector: string;
1350
- description: string;
1351
- method?: string;
1352
- arguments?: string[];
1353
- }
1354
- interface HistoryEntry {
1355
- method: "act" | "extract" | "observe" | "navigate" | "agent";
1356
- parameters: unknown;
1357
- result: unknown;
1358
- timestamp: string;
1359
- }
1360
- interface ExtractOptions {
1361
- model?: ModelConfiguration;
1362
- timeout?: number;
1363
- selector?: string;
1364
- page?: Page$1 | Page$2 | Page$3 | Page;
1365
- }
1366
- declare const defaultExtractSchema: z.ZodObject<{
1367
- extraction: z.ZodString;
1368
- }, z.core.$strip>;
1369
- declare const pageTextSchema: z.ZodObject<{
1370
- pageText: z.ZodString;
1371
- }, z.core.$strip>;
1372
- interface ObserveOptions {
1373
- model?: ModelConfiguration;
1374
- timeout?: number;
1375
- selector?: string;
1376
- page?: Page$1 | Page$2 | Page$3 | Page;
1377
- }
1378
- declare enum V3FunctionName {
1379
- ACT = "ACT",
1380
- EXTRACT = "EXTRACT",
1381
- OBSERVE = "OBSERVE",
1382
- AGENT = "AGENT"
1383
- }
1384
-
1385
- interface StagehandMetrics {
1386
- actPromptTokens: number;
1387
- actCompletionTokens: number;
1388
- actReasoningTokens: number;
1389
- actCachedInputTokens: number;
1390
- actInferenceTimeMs: number;
1391
- extractPromptTokens: number;
1392
- extractCompletionTokens: number;
1393
- extractReasoningTokens: number;
1394
- extractCachedInputTokens: number;
1395
- extractInferenceTimeMs: number;
1396
- observePromptTokens: number;
1397
- observeCompletionTokens: number;
1398
- observeReasoningTokens: number;
1399
- observeCachedInputTokens: number;
1400
- observeInferenceTimeMs: number;
1401
- agentPromptTokens: number;
1402
- agentCompletionTokens: number;
1403
- agentReasoningTokens: number;
1404
- agentCachedInputTokens: number;
1405
- agentInferenceTimeMs: number;
1406
- totalPromptTokens: number;
1407
- totalCompletionTokens: number;
1408
- totalReasoningTokens: number;
1409
- totalCachedInputTokens: number;
1410
- totalInferenceTimeMs: number;
1411
- }
1412
-
1413
- type V3Env = "LOCAL" | "BROWSERBASE";
1414
- /** Local launch options for V3 (chrome-launcher + CDP).
1415
- * Matches v2 shape where feasible; unsupported fields are accepted but ignored.
1416
- */
1417
- interface LocalBrowserLaunchOptions {
1418
- args?: string[];
1419
- executablePath?: string;
1420
- userDataDir?: string;
1421
- preserveUserDataDir?: boolean;
1422
- headless?: boolean;
1423
- devtools?: boolean;
1424
- chromiumSandbox?: boolean;
1425
- ignoreDefaultArgs?: boolean | string[];
1426
- proxy?: {
1427
- server: string;
1428
- bypass?: string;
1429
- username?: string;
1430
- password?: string;
1431
- };
1432
- locale?: string;
1433
- viewport?: {
1434
- width: number;
1435
- height: number;
1436
- };
1437
- deviceScaleFactor?: number;
1438
- hasTouch?: boolean;
1439
- ignoreHTTPSErrors?: boolean;
1440
- cdpUrl?: string;
1441
- connectTimeoutMs?: number;
1442
- downloadsPath?: string;
1443
- acceptDownloads?: boolean;
1444
- }
1445
- /** Constructor options for V3 */
1446
- interface V3Options {
1447
- env: V3Env;
1448
- apiKey?: string;
1449
- projectId?: string;
1450
- /**
1451
- * Optional: fine-tune Browserbase session creation or resume an existing session.
639
+ * Optional: fine-tune Browserbase session creation or resume an existing session.
1452
640
  */
1453
641
  browserbaseSessionCreateParams?: Omit<Browserbase.Sessions.SessionCreateParams, "projectId"> & {
1454
642
  projectId?: string;
@@ -1606,279 +794,1177 @@ declare class AISdkClient extends LLMClient {
1606
794
  createChatCompletion<T = ChatCompletion>({ options, }: CreateChatCompletionOptions): Promise<T>;
1607
795
  }
1608
796
 
1609
- interface StagehandAPIConstructorParams {
1610
- apiKey: string;
1611
- projectId: string;
1612
- logger: (message: LogLine) => void;
1613
- }
1614
- interface StartSessionParams {
1615
- modelName: string;
1616
- modelApiKey: string;
1617
- domSettleTimeoutMs: number;
1618
- verbose: number;
1619
- systemPrompt?: string;
1620
- browserbaseSessionCreateParams?: Omit<Browserbase.Sessions.SessionCreateParams, "projectId"> & {
1621
- projectId?: string;
1622
- };
1623
- selfHeal?: boolean;
1624
- browserbaseSessionID?: string;
797
+ interface StagehandAPIConstructorParams {
798
+ apiKey: string;
799
+ projectId: string;
800
+ logger: (message: LogLine) => void;
801
+ }
802
+ interface StartSessionParams {
803
+ modelName: string;
804
+ modelApiKey: string;
805
+ domSettleTimeoutMs: number;
806
+ verbose: number;
807
+ systemPrompt?: string;
808
+ browserbaseSessionCreateParams?: Omit<Browserbase.Sessions.SessionCreateParams, "projectId"> & {
809
+ projectId?: string;
810
+ };
811
+ selfHeal?: boolean;
812
+ browserbaseSessionID?: string;
813
+ }
814
+ interface StartSessionResult {
815
+ sessionId: string;
816
+ available?: boolean;
817
+ }
818
+ interface APIActParameters {
819
+ input: string | Action;
820
+ options?: ActOptions;
821
+ frameId?: string;
822
+ }
823
+ interface APIExtractParameters {
824
+ instruction?: string;
825
+ schema?: StagehandZodSchema;
826
+ options?: ExtractOptions;
827
+ frameId?: string;
828
+ }
829
+ interface APIObserveParameters {
830
+ instruction?: string;
831
+ options?: ObserveOptions;
832
+ frameId?: string;
833
+ }
834
+ interface SerializableResponse {
835
+ requestId: string;
836
+ frameId?: string;
837
+ loaderId?: string;
838
+ response: Protocol.Network.Response;
839
+ fromServiceWorkerFlag?: boolean;
840
+ finishedSettled?: boolean;
841
+ extraInfoHeaders?: Protocol.Network.Headers | null;
842
+ extraInfoHeadersText?: string;
843
+ }
844
+
845
+ /**
846
+ * Represents a path through a Zod schema from the root object down to a
847
+ * particular field. The `segments` array describes the chain of keys/indices.
848
+ *
849
+ * - **String** segments indicate object property names.
850
+ * - **Number** segments indicate array indices.
851
+ *
852
+ * For example, `["users", 0, "homepage"]` might describe reaching
853
+ * the `homepage` field in `schema.users[0].homepage`.
854
+ */
855
+ interface ZodPathSegments {
856
+ /**
857
+ * The ordered list of keys/indices leading from the schema root
858
+ * to the targeted field.
859
+ */
860
+ segments: Array<string | number>;
861
+ }
862
+
863
+ type EvaluateOptions = {
864
+ /** The question to ask about the task state */
865
+ question: string;
866
+ /** The answer to the question */
867
+ answer?: string;
868
+ /** Whether to take a screenshot of the task state, or array of screenshots to evaluate */
869
+ screenshot?: boolean | Buffer[];
870
+ /** Custom system prompt for the evaluator */
871
+ systemPrompt?: string;
872
+ /** Delay in milliseconds before taking the screenshot @default 250 */
873
+ screenshotDelayMs?: number;
874
+ /** The agent's reasoning/thought process for completing the task */
875
+ agentReasoning?: string;
876
+ };
877
+ type BatchAskOptions = {
878
+ /** Array of questions with optional answers */
879
+ questions: Array<{
880
+ question: string;
881
+ answer?: string;
882
+ }>;
883
+ /** Whether to take a screenshot of the task state */
884
+ screenshot?: boolean;
885
+ /** Custom system prompt for the evaluator */
886
+ systemPrompt?: string;
887
+ /** Delay in milliseconds before taking the screenshot @default 1000 */
888
+ screenshotDelayMs?: number;
889
+ };
890
+ /**
891
+ * Result of an evaluation
892
+ */
893
+ interface EvaluationResult {
894
+ /**
895
+ * The evaluation result ('YES', 'NO', or 'INVALID' if parsing failed or value was unexpected)
896
+ */
897
+ evaluation: "YES" | "NO" | "INVALID";
898
+ /**
899
+ * The reasoning behind the evaluation
900
+ */
901
+ reasoning: string;
902
+ }
903
+
904
+ declare class StagehandAPIClient {
905
+ private apiKey;
906
+ private projectId;
907
+ private sessionId?;
908
+ private modelApiKey;
909
+ private logger;
910
+ private fetchWithCookies;
911
+ constructor({ apiKey, projectId, logger }: StagehandAPIConstructorParams);
912
+ init({ modelName, modelApiKey, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, }: StartSessionParams): Promise<StartSessionResult>;
913
+ act({ input, options, frameId }: APIActParameters): Promise<ActResult>;
914
+ extract<T extends StagehandZodSchema>({ instruction, schema: zodSchema, options, frameId, }: APIExtractParameters): Promise<ExtractResult<T>>;
915
+ observe({ instruction, options, frameId, }: APIObserveParameters): Promise<Action[]>;
916
+ goto(url: string, options?: {
917
+ waitUntil?: "load" | "domcontentloaded" | "networkidle";
918
+ }, frameId?: string): Promise<SerializableResponse | null>;
919
+ agentExecute(agentConfig: AgentConfig, executeOptions: AgentExecuteOptions | string, frameId?: string): Promise<AgentResult>;
920
+ end(): Promise<Response>;
921
+ getReplayMetrics(): Promise<StagehandMetrics>;
922
+ private execute;
923
+ private request;
924
+ }
925
+
926
+ type InitScriptSource<Arg> = string | {
927
+ path?: string;
928
+ content?: string;
929
+ } | ((arg: Arg) => unknown);
930
+ /**
931
+ * V3Context
932
+ *
933
+ * Owns the root CDP connection and wires Target/Page events into Page.
934
+ * Maintains one Page per top-level target, adopts OOPIF child sessions into the owner Page,
935
+ * and tracks target→page and (root) frame→target mappings for lookups.
936
+ *
937
+ * IMPORTANT: FrameId → session ownership is managed inside Page (via its FrameRegistry).
938
+ * Context never “guesses” owners; it simply forwards events (with the emitting session)
939
+ * so Page can record the correct owner at event time.
940
+ */
941
+ declare class V3Context {
942
+ readonly conn: CdpConnection;
943
+ private readonly env;
944
+ private readonly apiClient;
945
+ private readonly localBrowserLaunchOptions;
946
+ private constructor();
947
+ private readonly _piercerInstalled;
948
+ private _lastPopupSignalAt;
949
+ private sessionKey;
950
+ private readonly _sessionInit;
951
+ private pagesByTarget;
952
+ private mainFrameToTarget;
953
+ private sessionOwnerPage;
954
+ private frameOwnerPage;
955
+ private pendingOopifByMainFrame;
956
+ private createdAtByTarget;
957
+ private typeByTarget;
958
+ private _pageOrder;
959
+ private pendingCreatedTargetUrl;
960
+ private readonly initScripts;
961
+ /**
962
+ * Create a Context for a given CDP websocket URL and bootstrap target wiring.
963
+ */
964
+ static create(wsUrl: string, opts?: {
965
+ env?: "LOCAL" | "BROWSERBASE";
966
+ apiClient?: StagehandAPIClient | null;
967
+ localBrowserLaunchOptions?: LocalBrowserLaunchOptions | null;
968
+ }): Promise<V3Context>;
969
+ /**
970
+ * Wait until at least one top-level Page has been created and registered.
971
+ * We poll internal maps that bootstrap/onAttachedToTarget populate.
972
+ */
973
+ private waitForFirstTopLevelPage;
974
+ private waitForInitialTopLevelTargets;
975
+ private ensurePiercer;
976
+ /** Mark a page target as the most-recent one (active). */
977
+ private _pushActive;
978
+ /** Remove a page target from the recency list (used on close). */
979
+ private _removeFromOrder;
980
+ /** Return the current active Page (most-recent page that still exists). */
981
+ activePage(): Page | undefined;
982
+ /** Explicitly mark a known Page as the most-recent active page (and focus it). */
983
+ setActivePage(page: Page): void;
984
+ addInitScript<Arg>(script: InitScriptSource<Arg>, arg?: Arg): Promise<void>;
985
+ /**
986
+ * Return top-level `Page`s (oldest → newest). OOPIF targets are not included.
987
+ */
988
+ pages(): Page[];
989
+ private applyInitScriptsToPage;
990
+ /**
991
+ * Resolve an owning `Page` by the **top-level main frame id**.
992
+ * Note: child (OOPIF) roots are intentionally not present in this mapping.
993
+ */
994
+ resolvePageByMainFrameId(frameId: string): Page | undefined;
995
+ /**
996
+ * Serialize the full frame tree for a given top-level main frame id.
997
+ */
998
+ getFullFrameTreeByMainFrameId(rootMainFrameId: string): Promise<Protocol.Page.FrameTree>;
999
+ /**
1000
+ * Create a new top-level page (tab) with the given URL and return its Page object.
1001
+ * Waits until the target is attached and registered.
1002
+ */
1003
+ newPage(url?: string): Promise<Page>;
1004
+ /**
1005
+ * Close CDP and clear all mappings. Best-effort cleanup.
1006
+ */
1007
+ close(): Promise<void>;
1008
+ /**
1009
+ * Bootstrap target lifecycle:
1010
+ * - Attach to existing targets.
1011
+ * - Attach on `Target.targetCreated` (fallback for OOPIFs).
1012
+ * - Handle auto-attach events.
1013
+ * - Clean up on detach/destroy.
1014
+ */
1015
+ private bootstrap;
1016
+ /**
1017
+ * Handle a newly attached target (top-level or potential OOPIF):
1018
+ * - Enable Page domain and lifecycle events.
1019
+ * - If top-level → create Page, wire listeners, resume.
1020
+ * - Else → probe child root frame id via `Page.getFrameTree` and adopt immediately
1021
+ * if the parent is known; otherwise stage until parent `frameAttached`.
1022
+ * - Resume the target only after listeners are wired.
1023
+ */
1024
+ private onAttachedToTarget;
1025
+ /**
1026
+ * Detach handler:
1027
+ * - Remove child session ownership and prune its subtree.
1028
+ * - If a top-level target, cleanup its `Page` and mappings.
1029
+ * - Drop any staged child for this session.
1030
+ */
1031
+ private onDetachedFromTarget;
1032
+ /**
1033
+ * Cleanup a top-level Page by target id, removing its root and staged children.
1034
+ */
1035
+ private cleanupByTarget;
1036
+ /**
1037
+ * Wire Page-domain frame events for a session into the owning Page & mappings.
1038
+ * We forward the *emitting session* with every event so Page can stamp ownership precisely.
1039
+ */
1040
+ private installFrameEventBridges;
1041
+ /**
1042
+ * Register that a session belongs to a Page (used by event routing).
1043
+ */
1044
+ private wireSessionToOwnerPage;
1045
+ /**
1046
+ * Utility: reverse-lookup the top-level target id that owns a given Page.
1047
+ */
1048
+ private findTargetIdByPage;
1049
+ private _notePopupSignal;
1050
+ /**
1051
+ * Await the current active page, waiting briefly if a popup/open was just triggered.
1052
+ * Normal path returns immediately; popup path waits up to timeoutMs for the new page.
1053
+ */
1054
+ awaitActivePage(timeoutMs?: number): Promise<Page>;
1055
+ }
1056
+
1057
+ type AgentReplayStep = AgentReplayActStep | AgentReplayFillFormStep | AgentReplayGotoStep | AgentReplayScrollStep | AgentReplayWaitStep | AgentReplayNavBackStep | {
1058
+ type: string;
1059
+ [key: string]: unknown;
1060
+ };
1061
+ interface AgentReplayActStep {
1062
+ type: "act";
1063
+ instruction: string;
1064
+ actions?: Action[];
1065
+ actionDescription?: string;
1066
+ message?: string;
1067
+ timeout?: number;
1625
1068
  }
1626
- interface StartSessionResult {
1627
- sessionId: string;
1628
- available?: boolean;
1069
+ interface AgentReplayFillFormStep {
1070
+ type: "fillForm";
1071
+ fields?: Array<{
1072
+ action: string;
1073
+ value: string;
1074
+ }>;
1075
+ observeResults?: Action[];
1076
+ actions?: Action[];
1629
1077
  }
1630
- interface APIActParameters {
1631
- input: string | Action;
1632
- options?: ActOptions;
1633
- frameId?: string;
1078
+ interface AgentReplayGotoStep {
1079
+ type: "goto";
1080
+ url: string;
1081
+ waitUntil?: LoadState;
1634
1082
  }
1635
- interface APIExtractParameters {
1636
- instruction?: string;
1637
- schema?: StagehandZodSchema;
1638
- options?: ExtractOptions;
1639
- frameId?: string;
1083
+ interface AgentReplayScrollStep {
1084
+ type: "scroll";
1085
+ deltaX?: number;
1086
+ deltaY?: number;
1087
+ anchor?: {
1088
+ x: number;
1089
+ y: number;
1090
+ };
1640
1091
  }
1641
- interface APIObserveParameters {
1642
- instruction?: string;
1643
- options?: ObserveOptions;
1644
- frameId?: string;
1092
+ interface AgentReplayWaitStep {
1093
+ type: "wait";
1094
+ timeMs: number;
1645
1095
  }
1646
- interface SerializableResponse {
1647
- requestId: string;
1648
- frameId?: string;
1649
- loaderId?: string;
1650
- response: Protocol.Network.Response;
1651
- fromServiceWorkerFlag?: boolean;
1652
- finishedSettled?: boolean;
1653
- extraInfoHeaders?: Protocol.Network.Headers | null;
1654
- extraInfoHeadersText?: string;
1096
+ interface AgentReplayNavBackStep {
1097
+ type: "navback";
1098
+ waitUntil?: LoadState;
1655
1099
  }
1656
1100
 
1657
1101
  /**
1658
- * Represents a path through a Zod schema from the root object down to a
1659
- * particular field. The `segments` array describes the chain of keys/indices.
1660
- *
1661
- * - **String** segments indicate object property names.
1662
- * - **Number** segments indicate array indices.
1102
+ * Response
1103
+ * -----------------
1663
1104
  *
1664
- * For example, `["users", 0, "homepage"]` might describe reaching
1665
- * the `homepage` field in `schema.users[0].homepage`.
1105
+ * This module implements a Playwright-inspired response wrapper that exposes
1106
+ * navigation metadata and helpers for retrieving HTTP response bodies. The
1107
+ * abstraction is consumed by navigation routines (e.g. `Page.goto`) so callers
1108
+ * can synchronously inspect status codes, lazily fetch body text, or await the
1109
+ * network layer finishing the request. The implementation is built directly on
1110
+ * Chrome DevTools Protocol primitives – it holds the originating `requestId`
1111
+ * so it can request payloads via `Network.getResponseBody`, and it listens for
1112
+ * `responseReceivedExtraInfo`, `loadingFinished`, and `loadingFailed` events to
1113
+ * hydrate the richer header view and resolve callers waiting on completion.
1666
1114
  */
1667
- interface ZodPathSegments {
1115
+
1116
+ type ServerAddr = {
1117
+ ipAddress: string;
1118
+ port: number;
1119
+ };
1120
+ /**
1121
+ * Thin wrapper around CDP response metadata that mirrors the ergonomics of
1122
+ * Playwright's `Response` class. The class intentionally keeps the same method
1123
+ * names so upstream integrations can transition with minimal code changes.
1124
+ */
1125
+ declare class Response$1 {
1126
+ private readonly page;
1127
+ private readonly session;
1128
+ private readonly requestId;
1129
+ private readonly frameId?;
1130
+ private readonly loaderId?;
1131
+ private readonly response;
1132
+ private readonly fromServiceWorkerFlag;
1133
+ private readonly serverAddress?;
1134
+ private headersObject;
1135
+ private headersArrayCache;
1136
+ private allHeadersCache;
1137
+ private readonly headerValuesMap;
1138
+ private finishedDeferred;
1139
+ private finishedSettled;
1140
+ private extraInfoHeaders;
1141
+ private extraInfoHeadersText;
1668
1142
  /**
1669
- * The ordered list of keys/indices leading from the schema root
1670
- * to the targeted field.
1143
+ * Build a response wrapper from the CDP notification associated with a
1144
+ * navigation. The constructor captures the owning page/session so follow-up
1145
+ * methods (body/text/json) can query CDP on-demand. The `response` payload is
1146
+ * the raw `Protocol.Network.Response` object emitted by Chrome.
1671
1147
  */
1672
- segments: Array<string | number>;
1148
+ constructor(params: {
1149
+ page: Page;
1150
+ session: CDPSessionLike;
1151
+ requestId: string;
1152
+ frameId?: string;
1153
+ loaderId?: string;
1154
+ response: Protocol.Network.Response;
1155
+ fromServiceWorker: boolean;
1156
+ });
1157
+ /** URL associated with the navigation request. */
1158
+ url(): string;
1159
+ /** HTTP status code reported by Chrome. */
1160
+ status(): number;
1161
+ /** Human-readable status text that accompanied the response. */
1162
+ statusText(): string;
1163
+ /** Convenience predicate that checks for 2xx statuses. */
1164
+ ok(): boolean;
1165
+ /** Returns the Stagehand frame object that initiated the navigation. */
1166
+ frame(): Frame | null;
1167
+ /** Indicates whether the response was serviced by a Service Worker. */
1168
+ fromServiceWorker(): boolean;
1169
+ /**
1170
+ * Returns TLS security metadata when provided by the browser. In practice
1171
+ * this includes certificate issuer, protocol, and validity interval.
1172
+ */
1173
+ securityDetails(): Promise<Protocol.Network.SecurityDetails | null>;
1174
+ /** Returns the resolved server address for the navigation when available. */
1175
+ serverAddr(): Promise<ServerAddr | null>;
1176
+ /**
1177
+ * Returns the response headers normalised to lowercase keys. Matches the
1178
+ * behaviour of Playwright's `headers()` by eliding duplicate header entries.
1179
+ */
1180
+ headers(): Record<string, string>;
1181
+ /**
1182
+ * Returns all headers including those only surfaced through
1183
+ * `responseReceivedExtraInfo` such as `set-cookie`. Values are reported as the
1184
+ * browser sends them (no further splitting or concatenation).
1185
+ */
1186
+ allHeaders(): Promise<Record<string, string>>;
1187
+ /** Returns a concatenated header string for the supplied header name. */
1188
+ headerValue(name: string): Promise<string | null>;
1189
+ /** Returns all values for a header (case-insensitive lookup). */
1190
+ headerValues(name: string): Promise<string[]>;
1191
+ /**
1192
+ * Returns header entries preserving their original wire casing and ordering.
1193
+ * Falls back to the CDP object when the raw header text is unavailable.
1194
+ */
1195
+ headersArray(): Promise<Array<{
1196
+ name: string;
1197
+ value: string;
1198
+ }>>;
1199
+ /**
1200
+ * Requests the raw response body from Chrome DevTools Protocol. The method is
1201
+ * intentionally lazy because not every caller needs the payload, and CDP only
1202
+ * allows retrieving it once the response completes.
1203
+ */
1204
+ body(): Promise<Buffer>;
1205
+ /** Decodes the response body as UTF-8 text. */
1206
+ text(): Promise<string>;
1207
+ /** Parses the response body as JSON and throws if parsing fails. */
1208
+ json<T = unknown>(): Promise<T>;
1209
+ /**
1210
+ * Resolves once the underlying network request completes or fails. Mirrors
1211
+ * Playwright's behaviour by resolving to `null` on success and to an `Error`
1212
+ * instance when Chrome reports `Network.loadingFailed`.
1213
+ */
1214
+ finished(): Promise<null | Error>;
1215
+ /**
1216
+ * Internal helper invoked by the navigation tracker when CDP reports extra
1217
+ * header information. This keeps the cached header views in sync with the
1218
+ * richer metadata.
1219
+ */
1220
+ applyExtraInfo(event: Protocol.Network.ResponseReceivedExtraInfoEvent): void;
1221
+ /**
1222
+ * Internal helper for creating a Response object from a Serializable
1223
+ * goto response from the Stagehand API
1224
+ */
1225
+ static fromSerializable(serialized: SerializableResponse, context: {
1226
+ page: Page;
1227
+ session: CDPSessionLike;
1228
+ }): Response$1;
1229
+ /** Marks the response as finished and resolves the `finished()` promise. */
1230
+ markFinished(error: Error | null): void;
1231
+ }
1232
+
1233
+ type AnyPage = Page$1 | Page$2 | Page$3 | Page;
1234
+
1235
+ type LoadState = "load" | "domcontentloaded" | "networkidle";
1236
+
1237
+ type ScreenshotAnimationsOption = "disabled" | "allow";
1238
+ type ScreenshotCaretOption = "hide" | "initial";
1239
+ type ScreenshotScaleOption = "css" | "device";
1240
+ interface ScreenshotClip {
1241
+ x: number;
1242
+ y: number;
1243
+ width: number;
1244
+ height: number;
1245
+ }
1246
+ interface ScreenshotOptions {
1247
+ animations?: ScreenshotAnimationsOption;
1248
+ caret?: ScreenshotCaretOption;
1249
+ clip?: ScreenshotClip;
1250
+ fullPage?: boolean;
1251
+ mask?: Locator[];
1252
+ maskColor?: string;
1253
+ omitBackground?: boolean;
1254
+ path?: string;
1255
+ quality?: number;
1256
+ scale?: ScreenshotScaleOption;
1257
+ style?: string;
1258
+ timeout?: number;
1259
+ type?: "png" | "jpeg";
1673
1260
  }
1674
1261
 
1675
- type EvaluateOptions = {
1676
- /** The question to ask about the task state */
1677
- question: string;
1678
- /** The answer to the question */
1679
- answer?: string;
1680
- /** Whether to take a screenshot of the task state, or array of screenshots to evaluate */
1681
- screenshot?: boolean | Buffer[];
1682
- /** Custom system prompt for the evaluator */
1683
- systemPrompt?: string;
1684
- /** Delay in milliseconds before taking the screenshot @default 250 */
1685
- screenshotDelayMs?: number;
1686
- /** The agent's reasoning/thought process for completing the task */
1687
- agentReasoning?: string;
1688
- };
1689
- type BatchAskOptions = {
1690
- /** Array of questions with optional answers */
1691
- questions: Array<{
1692
- question: string;
1693
- answer?: string;
1694
- }>;
1695
- /** Whether to take a screenshot of the task state */
1696
- screenshot?: boolean;
1697
- /** Custom system prompt for the evaluator */
1698
- systemPrompt?: string;
1699
- /** Delay in milliseconds before taking the screenshot @default 1000 */
1700
- screenshotDelayMs?: number;
1701
- };
1702
- /**
1703
- * Result of an evaluation
1704
- */
1705
- interface EvaluationResult {
1262
+ declare class Page {
1263
+ private readonly conn;
1264
+ private readonly mainSession;
1265
+ private readonly _targetId;
1266
+ /** Every CDP child session this page owns (top-level + adopted OOPIF sessions). */
1267
+ private readonly sessions;
1268
+ /** Unified truth for frame topology + ownership. */
1269
+ private readonly registry;
1270
+ /** A convenience wrapper bound to the current main frame id (top-level session). */
1271
+ private mainFrameWrapper;
1272
+ /** Compact ordinal per frameId (used by snapshot encoding). */
1273
+ private frameOrdinals;
1274
+ private nextOrdinal;
1275
+ /** cache Frames per frameId so everyone uses the same one */
1276
+ private readonly frameCache;
1277
+ private readonly browserIsRemote;
1278
+ /** Stable id for Frames created by this Page (use top-level TargetId). */
1279
+ private readonly pageId;
1280
+ /** Cached current URL for synchronous page.url() */
1281
+ private _currentUrl;
1282
+ private navigationCommandSeq;
1283
+ private latestNavigationCommandId;
1284
+ private readonly networkManager;
1285
+ /** Optional API client for routing page operations to the API */
1286
+ private readonly apiClient;
1287
+ private readonly consoleListeners;
1288
+ private readonly consoleHandlers;
1289
+ /** Document-start scripts installed across every session this page owns. */
1290
+ private readonly initScripts;
1291
+ private constructor();
1292
+ private installInitScriptOnSession;
1293
+ private applyInitScriptsToSession;
1294
+ registerInitScript(source: string): Promise<void>;
1295
+ private cursorEnabled;
1296
+ private ensureCursorScript;
1297
+ enableCursorOverlay(): Promise<void>;
1298
+ private updateCursor;
1706
1299
  /**
1707
- * The evaluation result ('YES', 'NO', or 'INVALID' if parsing failed or value was unexpected)
1300
+ * Factory: create Page and seed registry with the shallow tree from Page.getFrameTree.
1301
+ * Assumes Page domain is already enabled on the session passed in.
1302
+ */
1303
+ static create(conn: CdpConnection, session: CDPSessionLike, targetId: string, apiClient?: StagehandAPIClient | null, localBrowserLaunchOptions?: LocalBrowserLaunchOptions | null, browserIsRemote?: boolean): Promise<Page>;
1304
+ /**
1305
+ * Parent/child session emitted a `frameAttached`.
1306
+ * Topology update + ownership stamped to **emitting session**.
1307
+ */
1308
+ onFrameAttached(frameId: string, parentId: string | null, session: CDPSessionLike): void;
1309
+ /**
1310
+ * Parent/child session emitted a `frameDetached`.
1311
+ */
1312
+ onFrameDetached(frameId: string, reason?: "remove" | "swap" | string): void;
1313
+ /**
1314
+ * Parent/child session emitted a `frameNavigated`.
1315
+ * Topology + ownership update. Handles root swaps.
1316
+ */
1317
+ onFrameNavigated(frame: Protocol.Page.Frame, session: CDPSessionLike): void;
1318
+ onNavigatedWithinDocument(frameId: string, url: string, session: CDPSessionLike): void;
1319
+ /**
1320
+ * An OOPIF child session whose **main** frame id equals the parent iframe’s frameId
1321
+ * has been attached; adopt the session into this Page and seed ownership for its subtree.
1322
+ */
1323
+ adoptOopifSession(childSession: CDPSessionLike, childMainFrameId: string): void;
1324
+ /** Detach an adopted child session and prune its subtree */
1325
+ detachOopifSession(sessionId: string): void;
1326
+ /** Return the owning CDP session for a frameId (falls back to main session) */
1327
+ getSessionForFrame(frameId: string): CDPSessionLike;
1328
+ /** Always returns a Frame bound to the owning session */
1329
+ frameForId(frameId: string): Frame;
1330
+ /** Expose a session by id (used by snapshot to resolve session id -> session) */
1331
+ getSessionById(id: string): CDPSessionLike | undefined;
1332
+ registerSessionForNetwork(session: CDPSessionLike): void;
1333
+ unregisterSessionForNetwork(sessionId: string | undefined): void;
1334
+ on(event: "console", listener: ConsoleListener): Page;
1335
+ once(event: "console", listener: ConsoleListener): Page;
1336
+ off(event: "console", listener: ConsoleListener): Page;
1337
+ targetId(): string;
1338
+ /**
1339
+ * Send a CDP command through the main session.
1340
+ * Allows external consumers to execute arbitrary Chrome DevTools Protocol commands.
1341
+ *
1342
+ * @param method - The CDP method name (e.g., "Page.enable", "Runtime.evaluate")
1343
+ * @param params - Optional parameters for the CDP command
1344
+ * @returns Promise resolving to the typed CDP response
1345
+ *
1346
+ * @example
1347
+ * // Enable the Runtime domain
1348
+ * await page.sendCDP("Runtime.enable");
1349
+ *
1350
+ * @example
1351
+ * // Evaluate JavaScript with typed response
1352
+ * const result = await page.sendCDP<Protocol.Runtime.EvaluateResponse>(
1353
+ * "Runtime.evaluate",
1354
+ * { expression: "1 + 1" }
1355
+ * );
1356
+ */
1357
+ sendCDP<T = unknown>(method: string, params?: object): Promise<T>;
1358
+ /** Seed the cached URL before navigation events converge. */
1359
+ seedCurrentUrl(url: string | undefined | null): void;
1360
+ mainFrameId(): string;
1361
+ mainFrame(): Frame;
1362
+ /**
1363
+ * Close this top-level page (tab). Best-effort via Target.closeTarget.
1364
+ */
1365
+ close(): Promise<void>;
1366
+ getFullFrameTree(): Protocol.Page.FrameTree;
1367
+ asProtocolFrameTree(rootMainFrameId: string): Protocol.Page.FrameTree;
1368
+ private ensureOrdinal;
1369
+ /** Public getter for snapshot code / handlers. */
1370
+ getOrdinal(frameId: string): number;
1371
+ listAllFrameIds(): string[];
1372
+ private ensureConsoleTaps;
1373
+ private installConsoleTap;
1374
+ private sessionKey;
1375
+ private resolveSessionByKey;
1376
+ private teardownConsoleTap;
1377
+ private removeAllConsoleTaps;
1378
+ private emitConsole;
1379
+ /**
1380
+ * Navigate the page; optionally wait for a lifecycle state.
1381
+ * Waits on the **current** main frame and follows root swaps during navigation.
1382
+ */
1383
+ goto(url: string, options?: {
1384
+ waitUntil?: LoadState;
1385
+ timeoutMs?: number;
1386
+ }): Promise<Response$1 | null>;
1387
+ /**
1388
+ * Reload the page; optionally wait for a lifecycle state.
1389
+ */
1390
+ reload(options?: {
1391
+ waitUntil?: LoadState;
1392
+ timeoutMs?: number;
1393
+ ignoreCache?: boolean;
1394
+ }): Promise<Response$1 | null>;
1395
+ /**
1396
+ * Navigate back in history if possible; optionally wait for a lifecycle state.
1397
+ */
1398
+ goBack(options?: {
1399
+ waitUntil?: LoadState;
1400
+ timeoutMs?: number;
1401
+ }): Promise<Response$1 | null>;
1402
+ /**
1403
+ * Navigate forward in history if possible; optionally wait for a lifecycle state.
1404
+ */
1405
+ goForward(options?: {
1406
+ waitUntil?: LoadState;
1407
+ timeoutMs?: number;
1408
+ }): Promise<Response$1 | null>;
1409
+ /**
1410
+ * Return the current page URL (synchronous, cached from navigation events).
1411
+ */
1412
+ url(): string;
1413
+ private beginNavigationCommand;
1414
+ isCurrentNavigationCommand(id: number): boolean;
1415
+ /**
1416
+ * Return the current page title.
1417
+ * Prefers reading from the active document via Runtime.evaluate to reflect dynamic changes.
1418
+ * Falls back to navigation history title if evaluation is unavailable.
1419
+ */
1420
+ title(): Promise<string>;
1421
+ /**
1422
+ * Capture a screenshot with Playwright-style options.
1423
+ *
1424
+ * @param options Optional screenshot configuration.
1425
+ * @param options.animations Control CSS/Web animations during capture. Use
1426
+ * "disabled" to fast-forward finite animations and pause infinite ones.
1427
+ * @param options.caret Either hide the text caret (default) or leave it
1428
+ * visible via "initial".
1429
+ * @param options.clip Restrict capture to a specific rectangle (in CSS
1430
+ * pixels). Cannot be combined with `fullPage`.
1431
+ * @param options.fullPage Capture the full scrollable page instead of the
1432
+ * current viewport.
1433
+ * @param options.mask Array of locators that should be covered with an
1434
+ * overlay while the screenshot is taken.
1435
+ * @param options.maskColor CSS color used for the mask overlay (default
1436
+ * `#FF00FF`).
1437
+ * @param options.omitBackground Make the default page background transparent
1438
+ * (PNG only).
1439
+ * @param options.path File path to write the screenshot to. The file extension
1440
+ * determines the image type when `type` is not explicitly provided.
1441
+ * @param options.quality JPEG quality (0–100). Only applies when
1442
+ * `type === "jpeg"`.
1443
+ * @param options.scale Render scale: use "css" for one pixel per CSS pixel,
1444
+ * otherwise the default "device" leverages the current device pixel ratio.
1445
+ * @param options.style Additional CSS text injected into every frame before
1446
+ * capture (removed afterwards).
1447
+ * @param options.timeout Maximum capture duration in milliseconds before a
1448
+ * timeout error is thrown.
1449
+ * @param options.type Image format (`"png"` by default).
1450
+ */
1451
+ screenshot(options?: ScreenshotOptions): Promise<Buffer>;
1452
+ /**
1453
+ * Create a locator bound to the current main frame.
1454
+ */
1455
+ locator(selector: string): ReturnType<Frame["locator"]>;
1456
+ /**
1457
+ * Deep locator that supports cross-iframe traversal.
1458
+ * - Recognizes '>>' hop notation to enter iframe contexts.
1459
+ * - Supports deep XPath that includes iframe steps (e.g., '/html/body/iframe[2]//div').
1460
+ * Returns a Locator scoped to the appropriate frame.
1461
+ */
1462
+ deepLocator(selector: string): DeepLocatorDelegate;
1463
+ /**
1464
+ * Frame locator similar to Playwright: targets iframe elements and scopes
1465
+ * subsequent locators to that frame. Supports chaining.
1466
+ */
1467
+ frameLocator(selector: string): FrameLocator;
1468
+ /**
1469
+ * List all frames belonging to this page as Frame objects bound to their owning sessions.
1470
+ * The list is ordered by a stable ordinal assigned during the page lifetime.
1471
+ */
1472
+ frames(): Frame[];
1473
+ /**
1474
+ * Wait until the page reaches a lifecycle state on the current main frame.
1475
+ * Mirrors Playwright's API signatures.
1476
+ */
1477
+ waitForLoadState(state: LoadState, timeoutMs?: number): Promise<void>;
1478
+ /**
1479
+ * Evaluate a function or expression in the current main frame's isolated world.
1480
+ * - If a string is provided, it is treated as a JS expression.
1481
+ * - If a function is provided, it is stringified and invoked with the optional argument.
1482
+ * - The return value should be JSON-serializable. Non-serializable objects will
1483
+ * best-effort serialize via JSON.stringify inside the page context.
1484
+ */
1485
+ evaluate<R = unknown, Arg = unknown>(pageFunctionOrExpression: string | ((arg: Arg) => R | Promise<R>), arg?: Arg): Promise<R>;
1486
+ /**
1487
+ * Force the page viewport to an exact CSS size and device scale factor.
1488
+ * Ensures screenshots match width x height pixels when deviceScaleFactor = 1.
1708
1489
  */
1709
- evaluation: "YES" | "NO" | "INVALID";
1490
+ setViewportSize(width: number, height: number, options?: {
1491
+ deviceScaleFactor?: number;
1492
+ }): Promise<void>;
1710
1493
  /**
1711
- * The reasoning behind the evaluation
1494
+ * Click at absolute page coordinates (CSS pixels).
1495
+ * Dispatches mouseMoved → mousePressed → mouseReleased via CDP Input domain
1496
+ * on the top-level page target's session. Coordinates are relative to the
1497
+ * viewport origin (top-left). Does not scroll.
1712
1498
  */
1713
- reasoning: string;
1714
- }
1715
-
1716
- /**
1717
- * V3Context
1718
- *
1719
- * Owns the root CDP connection and wires Target/Page events into Page.
1720
- * Maintains one Page per top-level target, adopts OOPIF child sessions into the owner Page,
1721
- * and tracks target→page and (root) frame→target mappings for lookups.
1722
- *
1723
- * IMPORTANT: FrameId session ownership is managed inside Page (via its FrameRegistry).
1724
- * Context never “guesses” owners; it simply forwards events (with the emitting session)
1725
- * so Page can record the correct owner at event time.
1726
- */
1727
- declare class V3Context {
1728
- readonly conn: CdpConnection;
1729
- private readonly env;
1730
- private readonly apiClient;
1731
- private readonly localBrowserLaunchOptions;
1732
- private constructor();
1733
- private readonly _piercerInstalled;
1734
- private _lastPopupSignalAt;
1735
- private sessionKey;
1736
- private readonly _sessionInit;
1737
- private pagesByTarget;
1738
- private mainFrameToTarget;
1739
- private sessionOwnerPage;
1740
- private frameOwnerPage;
1741
- private pendingOopifByMainFrame;
1742
- private createdAtByTarget;
1743
- private typeByTarget;
1744
- private _pageOrder;
1745
- private pendingCreatedTargetUrl;
1499
+ click(x: number, y: number, options: {
1500
+ button?: "left" | "right" | "middle";
1501
+ clickCount?: number;
1502
+ returnXpath: true;
1503
+ }): Promise<string>;
1504
+ click(x: number, y: number, options?: {
1505
+ button?: "left" | "right" | "middle";
1506
+ clickCount?: number;
1507
+ returnXpath?: false;
1508
+ }): Promise<void>;
1509
+ click(x: number, y: number, options: {
1510
+ button?: "left" | "right" | "middle";
1511
+ clickCount?: number;
1512
+ returnXpath: boolean;
1513
+ }): Promise<void | string>;
1514
+ scroll(x: number, y: number, deltaX: number, deltaY: number, options: {
1515
+ returnXpath: true;
1516
+ }): Promise<string>;
1517
+ scroll(x: number, y: number, deltaX: number, deltaY: number, options?: {
1518
+ returnXpath?: false;
1519
+ }): Promise<void>;
1520
+ scroll(x: number, y: number, deltaX: number, deltaY: number, options: {
1521
+ returnXpath: boolean;
1522
+ }): Promise<void | string>;
1746
1523
  /**
1747
- * Create a Context for a given CDP websocket URL and bootstrap target wiring.
1524
+ * Drag from (fromX, fromY) to (toX, toY) using mouse events.
1525
+ * Sends mouseMoved → mousePressed → mouseMoved (steps) → mouseReleased.
1748
1526
  */
1749
- static create(wsUrl: string, opts?: {
1750
- env?: "LOCAL" | "BROWSERBASE";
1751
- apiClient?: StagehandAPIClient | null;
1752
- localBrowserLaunchOptions?: LocalBrowserLaunchOptions | null;
1753
- }): Promise<V3Context>;
1527
+ dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options: {
1528
+ button?: "left" | "right" | "middle";
1529
+ steps?: number;
1530
+ delay?: number;
1531
+ returnXpath: true;
1532
+ }): Promise<[string, string]>;
1533
+ dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options?: {
1534
+ button?: "left" | "right" | "middle";
1535
+ steps?: number;
1536
+ delay?: number;
1537
+ returnXpath?: false;
1538
+ }): Promise<void>;
1539
+ dragAndDrop(fromX: number, fromY: number, toX: number, toY: number, options: {
1540
+ button?: "left" | "right" | "middle";
1541
+ steps?: number;
1542
+ delay?: number;
1543
+ returnXpath: boolean;
1544
+ }): Promise<void | [string, string]>;
1754
1545
  /**
1755
- * Wait until at least one top-level Page has been created and registered.
1756
- * We poll internal maps that bootstrap/onAttachedToTarget populate.
1546
+ * Type a string by dispatching keyDown/keyUp events per character.
1547
+ * Focus must already be on the desired element. Uses CDP Input.dispatchKeyEvent
1548
+ * and never falls back to Input.insertText. Optional delay applies between
1549
+ * successive characters.
1757
1550
  */
1758
- private waitForFirstTopLevelPage;
1759
- private waitForInitialTopLevelTargets;
1760
- private ensurePiercer;
1761
- /** Mark a page target as the most-recent one (active). */
1762
- private _pushActive;
1763
- /** Remove a page target from the recency list (used on close). */
1764
- private _removeFromOrder;
1765
- /** Return the current active Page (most-recent page that still exists). */
1766
- activePage(): Page | undefined;
1767
- /** Explicitly mark a known Page as the most-recent active page (and focus it). */
1768
- setActivePage(page: Page): void;
1551
+ type(text: string, options?: {
1552
+ delay?: number;
1553
+ withMistakes?: boolean;
1554
+ }): Promise<void>;
1769
1555
  /**
1770
- * Return top-level `Page`s (oldest newest). OOPIF targets are not included.
1556
+ * Press a single key or key combination (keyDown then keyUp).
1557
+ * For printable characters, uses the text path on keyDown; for named keys, sets key/code/VK.
1558
+ * Supports key combinations with modifiers like "Cmd+A", "Ctrl+C", "Shift+Tab", etc.
1771
1559
  */
1772
- pages(): Page[];
1560
+ keyPress(key: string, options?: {
1561
+ delay?: number;
1562
+ }): Promise<void>;
1563
+ private _pressedModifiers;
1564
+ /** Press a key down without releasing it */
1565
+ private keyDown;
1566
+ /** Release a pressed key */
1567
+ private keyUp;
1568
+ /** Normalize modifier key names to match CDP expectations */
1569
+ private normalizeModifierKey;
1773
1570
  /**
1774
- * Resolve an owning `Page` by the **top-level main frame id**.
1775
- * Note: child (OOPIF) roots are intentionally not present in this mapping.
1571
+ * Get the map of named keys with their properties
1776
1572
  */
1777
- resolvePageByMainFrameId(frameId: string): Page | undefined;
1573
+ private getNamedKeys;
1778
1574
  /**
1779
- * Serialize the full frame tree for a given top-level main frame id.
1575
+ * Minimal description for printable keys (letters/digits/space) to provide code and VK.
1576
+ * Used when non-Shift modifiers are pressed to avoid sending text while keeping accelerator info.
1780
1577
  */
1781
- getFullFrameTreeByMainFrameId(rootMainFrameId: string): Promise<Protocol.Page.FrameTree>;
1578
+ private describePrintableKey;
1579
+ private isMacOS;
1782
1580
  /**
1783
- * Create a new top-level page (tab) with the given URL and return its Page object.
1784
- * Waits until the target is attached and registered.
1581
+ * Return Chromium mac editing commands (without trailing ':') for a given code like 'KeyA'
1582
+ * Only used on macOS to trigger system editing shortcuts (e.g., selectAll, copy, paste...).
1785
1583
  */
1786
- newPage(url?: string): Promise<Page>;
1584
+ private macCommandsFor;
1787
1585
  /**
1788
- * Close CDP and clear all mappings. Best-effort cleanup.
1586
+ * Create an isolated world for the **current** main frame and return its context id.
1789
1587
  */
1790
- close(): Promise<void>;
1588
+ private createIsolatedWorldForCurrentMain;
1791
1589
  /**
1792
- * Bootstrap target lifecycle:
1793
- * - Attach to existing targets.
1794
- * - Attach on `Target.targetCreated` (fallback for OOPIFs).
1795
- * - Handle auto-attach events.
1796
- * - Clean up on detach/destroy.
1590
+ * Wait until the **current** main frame reaches a lifecycle state.
1591
+ * - Fast path via `document.readyState`.
1592
+ * - Event path listens at the session level and compares incoming `frameId`
1593
+ * to `mainFrameId()` **at event time** to follow root swaps.
1797
1594
  */
1798
- private bootstrap;
1595
+ waitForMainLoadState(state: LoadState, timeoutMs?: number): Promise<void>;
1596
+ }
1597
+
1598
+ interface AgentContext {
1599
+ options: AgentExecuteOptions;
1600
+ maxSteps: number;
1601
+ systemPrompt: string;
1602
+ allTools: ToolSet;
1603
+ messages: ModelMessage[];
1604
+ wrappedModel: ReturnType<typeof wrapLanguageModel>;
1605
+ initialPageUrl: string;
1606
+ }
1607
+ interface AgentState {
1608
+ collectedReasoning: string[];
1609
+ actions: AgentAction[];
1610
+ finalMessage: string;
1611
+ completed: boolean;
1612
+ currentPageUrl: string;
1613
+ }
1614
+ interface AgentAction {
1615
+ type: string;
1616
+ reasoning?: string;
1617
+ taskCompleted?: boolean;
1618
+ action?: string;
1619
+ timeMs?: number;
1620
+ pageText?: string;
1621
+ pageUrl?: string;
1622
+ instruction?: string;
1623
+ [key: string]: unknown;
1624
+ }
1625
+ interface AgentResult {
1626
+ success: boolean;
1627
+ message: string;
1628
+ actions: AgentAction[];
1629
+ completed: boolean;
1630
+ metadata?: Record<string, unknown>;
1631
+ usage?: {
1632
+ input_tokens: number;
1633
+ output_tokens: number;
1634
+ reasoning_tokens?: number;
1635
+ cached_input_tokens?: number;
1636
+ inference_time_ms: number;
1637
+ };
1638
+ }
1639
+ type AgentStreamResult = StreamTextResult<ToolSet, never> & {
1640
+ result: Promise<AgentResult>;
1641
+ };
1642
+ interface AgentExecuteOptions {
1643
+ instruction: string;
1644
+ maxSteps?: number;
1645
+ page?: Page$1 | Page$2 | Page$3 | Page;
1646
+ highlightCursor?: boolean;
1647
+ }
1648
+ type AgentType = "openai" | "anthropic" | "google" | "microsoft";
1649
+ declare const AVAILABLE_CUA_MODELS: readonly ["openai/computer-use-preview", "openai/computer-use-preview-2025-03-11", "anthropic/claude-3-7-sonnet-latest", "anthropic/claude-haiku-4-5-20251001", "anthropic/claude-sonnet-4-20250514", "anthropic/claude-sonnet-4-5-20250929", "google/gemini-2.5-computer-use-preview-10-2025", "microsoft/fara-7b"];
1650
+ type AvailableCuaModel = (typeof AVAILABLE_CUA_MODELS)[number];
1651
+ interface AgentExecutionOptions<TOptions extends AgentExecuteOptions = AgentExecuteOptions> {
1652
+ options: TOptions;
1653
+ logger: (message: LogLine) => void;
1654
+ retries?: number;
1655
+ }
1656
+ interface AgentHandlerOptions {
1657
+ modelName: string;
1658
+ clientOptions?: ClientOptions;
1659
+ userProvidedInstructions?: string;
1660
+ experimental?: boolean;
1661
+ }
1662
+ interface ActionExecutionResult {
1663
+ success: boolean;
1664
+ error?: string;
1665
+ data?: unknown;
1666
+ }
1667
+ interface ToolUseItem extends ResponseItem {
1668
+ type: "tool_use";
1669
+ id: string;
1670
+ name: string;
1671
+ input: Record<string, unknown>;
1672
+ }
1673
+ interface AnthropicMessage {
1674
+ role: string;
1675
+ content: string | Array<AnthropicContentBlock>;
1676
+ }
1677
+ interface AnthropicContentBlock {
1678
+ type: string;
1679
+ [key: string]: unknown;
1680
+ }
1681
+ interface AnthropicTextBlock extends AnthropicContentBlock {
1682
+ type: "text";
1683
+ text: string;
1684
+ }
1685
+ interface AnthropicToolResult {
1686
+ type: "tool_result";
1687
+ tool_use_id: string;
1688
+ content: string | Array<AnthropicContentBlock>;
1689
+ }
1690
+ interface ResponseItem {
1691
+ type: string;
1692
+ id: string;
1693
+ [key: string]: unknown;
1694
+ }
1695
+ interface ComputerCallItem extends ResponseItem {
1696
+ type: "computer_call";
1697
+ call_id: string;
1698
+ action: {
1699
+ type: string;
1700
+ [key: string]: unknown;
1701
+ };
1702
+ pending_safety_checks?: Array<{
1703
+ id: string;
1704
+ code: string;
1705
+ message: string;
1706
+ }>;
1707
+ }
1708
+ interface FunctionCallItem extends ResponseItem {
1709
+ type: "function_call";
1710
+ call_id: string;
1711
+ name: string;
1712
+ arguments: string;
1713
+ }
1714
+ type ResponseInputItem = {
1715
+ role: string;
1716
+ content: string;
1717
+ } | {
1718
+ type: "computer_call_output";
1719
+ call_id: string;
1720
+ output: {
1721
+ type: "input_image";
1722
+ image_url: string;
1723
+ current_url?: string;
1724
+ error?: string;
1725
+ [key: string]: unknown;
1726
+ } | string;
1727
+ acknowledged_safety_checks?: Array<{
1728
+ id: string;
1729
+ code: string;
1730
+ message: string;
1731
+ }>;
1732
+ } | {
1733
+ type: "function_call_output";
1734
+ call_id: string;
1735
+ output: string;
1736
+ };
1737
+ interface AgentInstance {
1738
+ execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentResult>;
1739
+ }
1740
+ type AgentProviderType = AgentType;
1741
+ type AgentModelConfig<TModelName extends string = string> = {
1742
+ modelName: TModelName;
1743
+ } & Record<string, unknown>;
1744
+ type AgentConfig = {
1799
1745
  /**
1800
- * Handle a newly attached target (top-level or potential OOPIF):
1801
- * - Enable Page domain and lifecycle events.
1802
- * - If top-level → create Page, wire listeners, resume.
1803
- * - Else → probe child root frame id via `Page.getFrameTree` and adopt immediately
1804
- * if the parent is known; otherwise stage until parent `frameAttached`.
1805
- * - Resume the target only after listeners are wired.
1746
+ * Custom system prompt to provide to the agent. Overrides the default system prompt.
1806
1747
  */
1807
- private onAttachedToTarget;
1748
+ systemPrompt?: string;
1808
1749
  /**
1809
- * Detach handler:
1810
- * - Remove child session ownership and prune its subtree.
1811
- * - If a top-level target, cleanup its `Page` and mappings.
1812
- * - Drop any staged child for this session.
1750
+ * MCP integrations - Array of Client objects
1813
1751
  */
1814
- private onDetachedFromTarget;
1752
+ integrations?: (Client | string)[];
1815
1753
  /**
1816
- * Cleanup a top-level Page by target id, removing its root and staged children.
1754
+ * Tools passed to the agent client
1817
1755
  */
1818
- private cleanupByTarget;
1756
+ tools?: ToolSet;
1819
1757
  /**
1820
- * Wire Page-domain frame events for a session into the owning Page & mappings.
1821
- * We forward the *emitting session* with every event so Page can stamp ownership precisely.
1758
+ * Indicates CUA is disabled for this configuration
1822
1759
  */
1823
- private installFrameEventBridges;
1760
+ cua?: boolean;
1824
1761
  /**
1825
- * Register that a session belongs to a Page (used by event routing).
1762
+ * The model to use for agent functionality
1826
1763
  */
1827
- private wireSessionToOwnerPage;
1764
+ model?: string | AgentModelConfig<string>;
1828
1765
  /**
1829
- * Utility: reverse-lookup the top-level target id that owns a given Page.
1766
+ * The model to use for tool execution (observe/act calls within agent tools).
1767
+ * If not specified, inherits from the main model configuration.
1768
+ * Format: "provider/model" (e.g., "openai/gpt-4o-mini", "google/gemini-2.0-flash-exp")
1830
1769
  */
1831
- private findTargetIdByPage;
1832
- private _notePopupSignal;
1770
+ executionModel?: string | AgentModelConfig<string>;
1833
1771
  /**
1834
- * Await the current active page, waiting briefly if a popup/open was just triggered.
1835
- * Normal path returns immediately; popup path waits up to timeoutMs for the new page.
1772
+ * Enable streaming mode for the agent.
1773
+ * When true, execute() returns AgentStreamResult with textStream for incremental output.
1774
+ * When false (default), execute() returns AgentResult after completion.
1836
1775
  */
1837
- awaitActivePage(timeoutMs?: number): Promise<Page>;
1776
+ stream?: boolean;
1777
+ };
1778
+ /**
1779
+ * Agent instance returned when stream: true is set in AgentConfig.
1780
+ * execute() returns a streaming result that can be consumed incrementally.
1781
+ */
1782
+ interface StreamingAgentInstance {
1783
+ execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentStreamResult>;
1784
+ }
1785
+ /**
1786
+ * Agent instance returned when stream is false or not set in AgentConfig.
1787
+ * execute() returns a result after the agent completes.
1788
+ */
1789
+ interface NonStreamingAgentInstance {
1790
+ execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentResult>;
1838
1791
  }
1839
1792
 
1840
- type AgentReplayStep = AgentReplayActStep | AgentReplayFillFormStep | AgentReplayGotoStep | AgentReplayScrollStep | AgentReplayWaitStep | AgentReplayNavBackStep | {
1841
- type: string;
1842
- [key: string]: unknown;
1793
+ type OpenAIClientOptions = Pick<ClientOptions$1, "baseURL" | "apiKey">;
1794
+ type AnthropicClientOptions = Pick<ClientOptions$2, "baseURL" | "apiKey">;
1795
+ interface GoogleServiceAccountCredentials {
1796
+ type?: string;
1797
+ project_id?: string;
1798
+ private_key_id?: string;
1799
+ private_key?: string;
1800
+ client_email?: string;
1801
+ client_id?: string;
1802
+ auth_uri?: string;
1803
+ token_uri?: string;
1804
+ auth_provider_x509_cert_url?: string;
1805
+ client_x509_cert_url?: string;
1806
+ universe_domain?: string;
1807
+ }
1808
+ type GoogleVertexProviderSettings = Pick<GoogleVertexProviderSettings$1, "project" | "location"> & {
1809
+ googleAuthOptions?: {
1810
+ credentials?: GoogleServiceAccountCredentials;
1811
+ };
1843
1812
  };
1844
- interface AgentReplayActStep {
1845
- type: "act";
1846
- instruction: string;
1847
- actions?: Action[];
1848
- actionDescription?: string;
1849
- message?: string;
1850
- timeout?: number;
1813
+ type AnthropicJsonSchemaObject = {
1814
+ definitions?: {
1815
+ MySchema?: {
1816
+ properties?: Record<string, unknown>;
1817
+ required?: string[];
1818
+ };
1819
+ };
1820
+ properties?: Record<string, unknown>;
1821
+ required?: string[];
1822
+ } & Record<string, unknown>;
1823
+ interface LLMTool {
1824
+ type: "function";
1825
+ name: string;
1826
+ description: string;
1827
+ parameters: Record<string, unknown>;
1851
1828
  }
1852
- interface AgentReplayFillFormStep {
1853
- type: "fillForm";
1854
- fields?: Array<{
1855
- action: string;
1856
- value: string;
1857
- }>;
1858
- observeResults?: Action[];
1859
- actions?: Action[];
1829
+ type AISDKProvider = (modelName: string) => LanguageModelV2;
1830
+ type AISDKCustomProvider = (options: ClientOptions) => AISDKProvider;
1831
+ type AvailableModel = "gpt-4.1" | "gpt-4.1-mini" | "gpt-4.1-nano" | "o4-mini" | "o3" | "o3-mini" | "o1" | "o1-mini" | "gpt-4o" | "gpt-4o-mini" | "gpt-4o-2024-08-06" | "gpt-4.5-preview" | "o1-preview" | "claude-3-5-sonnet-latest" | "claude-3-5-sonnet-20241022" | "claude-3-5-sonnet-20240620" | "claude-3-7-sonnet-latest" | "claude-3-7-sonnet-20250219" | "cerebras-llama-3.3-70b" | "cerebras-llama-3.1-8b" | "groq-llama-3.3-70b-versatile" | "groq-llama-3.3-70b-specdec" | "gemini-1.5-flash" | "gemini-1.5-pro" | "gemini-1.5-flash-8b" | "gemini-2.0-flash-lite" | "gemini-2.0-flash" | "gemini-2.5-flash-preview-04-17" | "gemini-2.5-pro-preview-03-25" | string;
1832
+ type ModelProvider = "openai" | "anthropic" | "cerebras" | "groq" | "google" | "aisdk";
1833
+ type ClientOptions = (OpenAIClientOptions | AnthropicClientOptions | GoogleVertexProviderSettings) & {
1834
+ apiKey?: string;
1835
+ provider?: AgentProviderType;
1836
+ baseURL?: string;
1837
+ /** OpenAI organization ID */
1838
+ organization?: string;
1839
+ /** Delay between agent actions in ms */
1840
+ waitBetweenActions?: number;
1841
+ /** Anthropic thinking budget for extended thinking */
1842
+ thinkingBudget?: number;
1843
+ /** Environment type for CUA agents (browser, mac, windows, ubuntu) */
1844
+ environment?: string;
1845
+ /** Max images for Microsoft FARA agent */
1846
+ maxImages?: number;
1847
+ /** Temperature for model inference */
1848
+ temperature?: number;
1849
+ };
1850
+ type ModelConfiguration = AvailableModel | (ClientOptions & {
1851
+ modelName: AvailableModel;
1852
+ });
1853
+
1854
+ interface ChatMessage {
1855
+ role: "system" | "user" | "assistant";
1856
+ content: ChatMessageContent;
1860
1857
  }
1861
- interface AgentReplayGotoStep {
1862
- type: "goto";
1863
- url: string;
1864
- waitUntil?: LoadState;
1858
+ type ChatMessageContent = string | (ChatMessageImageContent | ChatMessageTextContent)[];
1859
+ interface ChatMessageImageContent {
1860
+ type: string;
1861
+ image_url?: {
1862
+ url: string;
1863
+ };
1864
+ text?: string;
1865
+ source?: {
1866
+ type: string;
1867
+ media_type: string;
1868
+ data: string;
1869
+ };
1865
1870
  }
1866
- interface AgentReplayScrollStep {
1867
- type: "scroll";
1868
- deltaX?: number;
1869
- deltaY?: number;
1870
- anchor?: {
1871
- x: number;
1872
- y: number;
1871
+ interface ChatMessageTextContent {
1872
+ type: string;
1873
+ text: string;
1874
+ }
1875
+ declare const AnnotatedScreenshotText = "This is a screenshot of the current page state with the elements annotated on it. Each element id is annotated with a number to the top left of it. Duplicate annotations at the same location are under each other vertically.";
1876
+ interface ChatCompletionOptions {
1877
+ messages: ChatMessage[];
1878
+ temperature?: number;
1879
+ top_p?: number;
1880
+ frequency_penalty?: number;
1881
+ presence_penalty?: number;
1882
+ image?: {
1883
+ buffer: Buffer;
1884
+ description?: string;
1885
+ };
1886
+ response_model?: {
1887
+ name: string;
1888
+ schema: StagehandZodSchema;
1873
1889
  };
1890
+ tools?: LLMTool[];
1891
+ tool_choice?: "auto" | "none" | "required";
1892
+ maxOutputTokens?: number;
1893
+ requestId?: string;
1874
1894
  }
1875
- interface AgentReplayWaitStep {
1876
- type: "wait";
1877
- timeMs: number;
1895
+ type LLMResponse = {
1896
+ id: string;
1897
+ object: string;
1898
+ created: number;
1899
+ model: string;
1900
+ choices: {
1901
+ index: number;
1902
+ message: {
1903
+ role: string;
1904
+ content: string | null;
1905
+ tool_calls: {
1906
+ id: string;
1907
+ type: string;
1908
+ function: {
1909
+ name: string;
1910
+ arguments: string;
1911
+ };
1912
+ }[];
1913
+ };
1914
+ finish_reason: string;
1915
+ }[];
1916
+ usage: {
1917
+ prompt_tokens: number;
1918
+ completion_tokens: number;
1919
+ total_tokens: number;
1920
+ };
1921
+ };
1922
+ interface CreateChatCompletionOptions {
1923
+ options: ChatCompletionOptions;
1924
+ logger: (message: LogLine) => void;
1925
+ retries?: number;
1878
1926
  }
1879
- interface AgentReplayNavBackStep {
1880
- type: "navback";
1881
- waitUntil?: LoadState;
1927
+ /** Simple usage shape if your LLM returns usage tokens. */
1928
+ interface LLMUsage {
1929
+ prompt_tokens: number;
1930
+ completion_tokens: number;
1931
+ total_tokens: number;
1932
+ reasoning_tokens?: number;
1933
+ cached_input_tokens?: number;
1934
+ }
1935
+ /**
1936
+ * For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }
1937
+ */
1938
+ interface LLMParsedResponse<T> {
1939
+ data: T;
1940
+ usage?: LLMUsage;
1941
+ }
1942
+ declare abstract class LLMClient {
1943
+ type: "openai" | "anthropic" | "cerebras" | "groq" | (string & {});
1944
+ modelName: AvailableModel | (string & {});
1945
+ hasVision: boolean;
1946
+ clientOptions: ClientOptions;
1947
+ userProvidedInstructions?: string;
1948
+ constructor(modelName: AvailableModel, userProvidedInstructions?: string);
1949
+ abstract createChatCompletion<T>(options: CreateChatCompletionOptions & {
1950
+ options: {
1951
+ response_model: {
1952
+ name: string;
1953
+ schema: StagehandZodSchema;
1954
+ };
1955
+ };
1956
+ }): Promise<LLMParsedResponse<T>>;
1957
+ abstract createChatCompletion<T = LLMResponse>(options: CreateChatCompletionOptions): Promise<T>;
1958
+ generateObject: typeof generateObject;
1959
+ generateText: typeof generateText;
1960
+ streamText: typeof streamText;
1961
+ streamObject: typeof streamObject;
1962
+ generateImage: typeof experimental_generateImage;
1963
+ embed: typeof embed;
1964
+ embedMany: typeof embedMany;
1965
+ transcribe: typeof experimental_transcribe;
1966
+ generateSpeech: typeof experimental_generateSpeech;
1967
+ getLanguageModel?(): LanguageModelV2;
1882
1968
  }
1883
1969
 
1884
1970
  /**
@@ -1911,7 +1997,11 @@ declare class V3 {
1911
1997
  private readonly domSettleTimeoutMs?;
1912
1998
  private _isClosing;
1913
1999
  browserbaseSessionId?: string;
2000
+ private browserbaseSessionUrl?;
2001
+ private browserbaseDebugUrl?;
1914
2002
  get browserbaseSessionID(): string | undefined;
2003
+ get browserbaseSessionURL(): string | undefined;
2004
+ get browserbaseDebugURL(): string | undefined;
1915
2005
  private _onCdpClosed;
1916
2006
  readonly experimental: boolean;
1917
2007
  readonly logInferenceToFile: boolean;
@@ -1959,6 +2049,7 @@ declare class V3 {
1959
2049
  /** Apply post-connect local browser options that require CDP. */
1960
2050
  private _applyPostConnectLocalOptions;
1961
2051
  private _ensureBrowserbaseDownloadsEnabled;
2052
+ private resetBrowserbaseSessionMetadata;
1962
2053
  /**
1963
2054
  * Run an "act" instruction through the ActHandler.
1964
2055
  *
@@ -2010,11 +2101,27 @@ declare class V3 {
2010
2101
  /** Resolve an external page reference or fall back to the active V3 page. */
2011
2102
  private resolvePage;
2012
2103
  private normalizeToV3Page;
2104
+ private _logBrowserbaseSessionStatus;
2105
+ /**
2106
+ * Prepares shared context for agent execution (both execute and stream).
2107
+ * Extracts duplicated setup logic into a single helper.
2108
+ */
2109
+ private prepareAgentExecution;
2013
2110
  /**
2014
2111
  * Create a v3 agent instance (AISDK tool-based) with execute().
2015
2112
  * Mirrors the v2 Stagehand.agent() tool mode (no CUA provider here).
2113
+ *
2114
+ * @overload When stream: true, returns a streaming agent where execute() returns AgentStreamResult
2115
+ * @overload When stream is false/undefined, returns a non-streaming agent where execute() returns AgentResult
2016
2116
  */
2017
- agent(options?: AgentConfig): {
2117
+ agent(options: AgentConfig & {
2118
+ stream: true;
2119
+ }): {
2120
+ execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentStreamResult>;
2121
+ };
2122
+ agent(options?: AgentConfig & {
2123
+ stream?: false;
2124
+ }): {
2018
2125
  execute: (instructionOrOptions: string | AgentExecuteOptions) => Promise<AgentResult>;
2019
2126
  };
2020
2127
  }
@@ -2026,7 +2133,7 @@ declare class V3 {
2026
2133
  declare abstract class AgentClient {
2027
2134
  type: AgentType;
2028
2135
  modelName: string;
2029
- clientOptions: Record<string, unknown>;
2136
+ clientOptions: ClientOptions;
2030
2137
  userProvidedInstructions?: string;
2031
2138
  constructor(type: AgentType, modelName: string, userProvidedInstructions?: string);
2032
2139
  abstract execute(options: AgentExecutionOptions): Promise<AgentResult>;
@@ -2049,7 +2156,7 @@ declare class AgentProvider {
2049
2156
  * Create a new agent provider
2050
2157
  */
2051
2158
  constructor(logger: (message: LogLine) => void);
2052
- getClient(modelName: string, clientOptions?: Record<string, unknown>, userProvidedInstructions?: string, tools?: ToolSet$1): AgentClient;
2159
+ getClient(modelName: string, clientOptions?: ClientOptions, userProvidedInstructions?: string, tools?: ToolSet$1): AgentClient;
2053
2160
  static getAgentProvider(modelName: string): AgentProviderType;
2054
2161
  }
2055
2162
 
@@ -2144,4 +2251,4 @@ declare class V3Evaluator {
2144
2251
  private _evaluateWithMultipleScreenshots;
2145
2252
  }
2146
2253
 
2147
- export { type AISDKCustomProvider, type AISDKProvider, AISdkClient, AVAILABLE_CUA_MODELS, type ActOptions, type ActResult, type Action, type ActionExecutionResult, type AgentAction, type AgentConfig, type AgentExecuteOptions, type AgentExecutionOptions, type AgentHandlerOptions, type AgentInstance, type AgentModelConfig, AgentProvider, type AgentProviderType, type AgentResult, AgentScreenshotProviderError, type AgentType, AnnotatedScreenshotText, type AnthropicContentBlock, type AnthropicJsonSchemaObject, type AnthropicMessage, type AnthropicTextBlock, type AnthropicToolResult, type AnyPage, type AvailableCuaModel, type AvailableModel, BrowserbaseSessionNotFoundError, CaptchaTimeoutError, type ChatCompletionOptions, type ChatMessage, type ChatMessageContent, type ChatMessageImageContent, type ChatMessageTextContent, type ClientOptions, type ComputerCallItem, ConnectionTimeoutError, type ConsoleListener, ConsoleMessage, ContentFrameNotFoundError, type CreateChatCompletionOptions, CreateChatCompletionResponseError, CuaModelRequiredError, ElementNotVisibleError, ExperimentalApiConflictError, ExperimentalNotConfiguredError, type ExtractOptions, type ExtractResult, type FunctionCallItem, HandlerNotInitializedError, type HistoryEntry, type InferStagehandSchema, InvalidAISDKModelFormatError, type JsonSchema, type JsonSchemaDocument, type JsonSchemaProperty, LLMClient, type LLMParsedResponse, type LLMResponse, LLMResponseError, type LLMTool, type LLMUsage, LOG_LEVEL_NAMES, type LoadState, type LocalBrowserLaunchOptions, type LogLevel, type LogLine, type Logger, MCPConnectionError, MissingEnvironmentVariableError, MissingLLMConfigurationError, type ModelConfiguration, type ModelProvider, type ObserveOptions, Page, PageNotFoundError, Response$1 as Response, ResponseBodyError, type ResponseInputItem, type ResponseItem, ResponseParseError, V3 as Stagehand, StagehandAPIError, StagehandAPIUnauthorizedError, StagehandClickError, StagehandDefaultError, StagehandDomProcessError, StagehandElementNotFoundError, StagehandEnvironmentError, StagehandError, StagehandEvalError, StagehandHttpError, StagehandIframeError, StagehandInitError, StagehandInvalidArgumentError, type StagehandMetrics, StagehandMissingArgumentError, StagehandNotInitializedError, StagehandResponseBodyError, StagehandResponseParseError, StagehandServerError, StagehandShadowRootMissingError, StagehandShadowSegmentEmptyError, StagehandShadowSegmentNotFoundError, type StagehandZodObject, type StagehandZodSchema, TimeoutError, type ToolUseItem, UnsupportedAISDKModelProviderError, UnsupportedModelError, UnsupportedModelProviderError, V3, type V3Env, V3Evaluator, V3FunctionName, type V3Options, XPathResolutionError, ZodSchemaValidationError, connectToMCPServer, defaultExtractSchema, getZodType, injectUrls, isRunningInBun, isZod3Schema, isZod4Schema, jsonSchemaToZod, loadApiKeyFromEnv, modelToAgentProviderMap, pageTextSchema, providerEnvVarMap, toGeminiSchema, toJsonSchema, transformSchema, trimTrailingTextNode, validateZodSchema };
2254
+ export { type AISDKCustomProvider, type AISDKProvider, AISdkClient, AVAILABLE_CUA_MODELS, type ActOptions, type ActResult, type Action, type ActionExecutionResult, type AgentAction, type AgentConfig, type AgentContext, type AgentExecuteOptions, type AgentExecutionOptions, type AgentHandlerOptions, type AgentInstance, type AgentModelConfig, AgentProvider, type AgentProviderType, type AgentResult, AgentScreenshotProviderError, type AgentState, type AgentStreamResult, type AgentType, AnnotatedScreenshotText, type AnthropicClientOptions, type AnthropicContentBlock, type AnthropicJsonSchemaObject, type AnthropicMessage, type AnthropicTextBlock, type AnthropicToolResult, type AnyPage, type AvailableCuaModel, type AvailableModel, BrowserbaseSessionNotFoundError, CaptchaTimeoutError, type ChatCompletionOptions, type ChatMessage, type ChatMessageContent, type ChatMessageImageContent, type ChatMessageTextContent, type ClientOptions, type ComputerCallItem, ConnectionTimeoutError, type ConsoleListener, ConsoleMessage, ContentFrameNotFoundError, type CreateChatCompletionOptions, CreateChatCompletionResponseError, CuaModelRequiredError, ElementNotVisibleError, ExperimentalApiConflictError, ExperimentalNotConfiguredError, type ExtractOptions, type ExtractResult, type FunctionCallItem, type GoogleServiceAccountCredentials, type GoogleVertexProviderSettings, HandlerNotInitializedError, type HistoryEntry, type InferStagehandSchema, InvalidAISDKModelFormatError, type JsonSchema, type JsonSchemaDocument, type JsonSchemaProperty, LLMClient, type LLMParsedResponse, type LLMResponse, LLMResponseError, type LLMTool, type LLMUsage, LOG_LEVEL_NAMES, type LoadState, type LocalBrowserLaunchOptions, type LogLevel, type LogLine, type Logger, MCPConnectionError, MissingEnvironmentVariableError, MissingLLMConfigurationError, type ModelConfiguration, type ModelProvider, type NonStreamingAgentInstance, type ObserveOptions, type OpenAIClientOptions, Page, PageNotFoundError, Response$1 as Response, ResponseBodyError, type ResponseInputItem, type ResponseItem, ResponseParseError, V3 as Stagehand, StagehandAPIError, StagehandAPIUnauthorizedError, StagehandClickError, StagehandDefaultError, StagehandDomProcessError, StagehandElementNotFoundError, StagehandEnvironmentError, StagehandError, StagehandEvalError, StagehandHttpError, StagehandIframeError, StagehandInitError, StagehandInvalidArgumentError, type StagehandMetrics, StagehandMissingArgumentError, StagehandNotInitializedError, StagehandResponseBodyError, StagehandResponseParseError, StagehandServerError, StagehandShadowRootMissingError, StagehandShadowSegmentEmptyError, StagehandShadowSegmentNotFoundError, type StagehandZodObject, type StagehandZodSchema, type StreamingAgentInstance, TimeoutError, type ToolUseItem, UnsupportedAISDKModelProviderError, UnsupportedModelError, UnsupportedModelProviderError, V3, type V3Env, V3Evaluator, V3FunctionName, type V3Options, XPathResolutionError, ZodSchemaValidationError, connectToMCPServer, defaultExtractSchema, getZodType, injectUrls, isRunningInBun, isZod3Schema, isZod4Schema, jsonSchemaToZod, loadApiKeyFromEnv, modelToAgentProviderMap, pageTextSchema, providerEnvVarMap, toGeminiSchema, toJsonSchema, transformSchema, trimTrailingTextNode, validateZodSchema };