@browserbasehq/stagehand 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,324 @@
1
+ import {
2
+ actTools,
3
+ buildActSystemPrompt,
4
+ buildActUserPrompt,
5
+ buildAskSystemPrompt,
6
+ buildExtractSystemPrompt,
7
+ buildExtractUserPrompt,
8
+ buildObserveSystemPrompt,
9
+ buildObserveUserMessage,
10
+ buildAskUserPrompt,
11
+ buildVerifyActCompletionSystemPrompt,
12
+ buildVerifyActCompletionUserPrompt,
13
+ buildRefineSystemPrompt,
14
+ buildRefineUserPrompt,
15
+ buildMetadataSystemPrompt,
16
+ buildMetadataPrompt,
17
+ } from "./prompt";
18
+ import { z } from "zod";
19
+ import {
20
+ AnnotatedScreenshotText,
21
+ ChatMessage,
22
+ LLMClient,
23
+ } from "./llm/LLMClient";
24
+ import { VerifyActCompletionParams } from "../types/inference";
25
+ import { ActResult, ActParams } from "../types/act";
26
+
27
+ export async function verifyActCompletion({
28
+ goal,
29
+ steps,
30
+ llmClient,
31
+ screenshot,
32
+ domElements,
33
+ logger,
34
+ requestId,
35
+ }: VerifyActCompletionParams): Promise<boolean> {
36
+ const messages: ChatMessage[] = [
37
+ buildVerifyActCompletionSystemPrompt(),
38
+ buildVerifyActCompletionUserPrompt(goal, steps, domElements),
39
+ ];
40
+
41
+ const response = await llmClient.createChatCompletion({
42
+ messages,
43
+ temperature: 0.1,
44
+ top_p: 1,
45
+ frequency_penalty: 0,
46
+ presence_penalty: 0,
47
+ image: screenshot
48
+ ? {
49
+ buffer: screenshot,
50
+ description: "This is a screenshot of the whole visible page.",
51
+ }
52
+ : undefined,
53
+ response_model: {
54
+ name: "Verification",
55
+ schema: z.object({
56
+ completed: z.boolean().describe("true if the goal is accomplished"),
57
+ }),
58
+ },
59
+ requestId,
60
+ });
61
+
62
+ if (!response || typeof response !== "object") {
63
+ logger({
64
+ category: "VerifyAct",
65
+ message: "Unexpected response format: " + JSON.stringify(response),
66
+ });
67
+ return false;
68
+ }
69
+
70
+ if (response.completed === undefined) {
71
+ logger({
72
+ category: "VerifyAct",
73
+ message: "Missing 'completed' field in response",
74
+ });
75
+ return false;
76
+ }
77
+
78
+ return response.completed;
79
+ }
80
+
81
+ export function fillInVariables(
82
+ text: string,
83
+ variables: Record<string, string>,
84
+ ) {
85
+ let processedText = text;
86
+ Object.entries(variables).forEach(([key, value]) => {
87
+ const placeholder = `<|${key.toUpperCase()}|>`;
88
+ processedText = processedText.replace(placeholder, value);
89
+ });
90
+ return processedText;
91
+ }
92
+
93
+ export async function act({
94
+ action,
95
+ domElements,
96
+ steps,
97
+ llmClient,
98
+ screenshot,
99
+ retries = 0,
100
+ logger,
101
+ requestId,
102
+ variables,
103
+ }: ActParams): Promise<ActResult | null> {
104
+ const messages: ChatMessage[] = [
105
+ buildActSystemPrompt(),
106
+ buildActUserPrompt(action, steps, domElements, variables),
107
+ ];
108
+
109
+ const response = await llmClient.createChatCompletion({
110
+ messages,
111
+ temperature: 0.1,
112
+ top_p: 1,
113
+ frequency_penalty: 0,
114
+ presence_penalty: 0,
115
+ tool_choice: "auto" as const,
116
+ tools: actTools,
117
+ image: screenshot
118
+ ? { buffer: screenshot, description: AnnotatedScreenshotText }
119
+ : undefined,
120
+ requestId,
121
+ });
122
+
123
+ const toolCalls = response.choices[0].message.tool_calls;
124
+
125
+ if (toolCalls && toolCalls.length > 0) {
126
+ if (toolCalls[0].function.name === "skipSection") {
127
+ return null;
128
+ }
129
+
130
+ return JSON.parse(toolCalls[0].function.arguments);
131
+ } else {
132
+ if (retries >= 2) {
133
+ logger({
134
+ category: "Act",
135
+ message: "No tool calls found in response",
136
+ });
137
+ return null;
138
+ }
139
+
140
+ return act({
141
+ action,
142
+ domElements,
143
+ steps,
144
+ llmClient,
145
+ retries: retries + 1,
146
+ logger,
147
+ requestId,
148
+ });
149
+ }
150
+ }
151
+
152
+ export async function extract({
153
+ instruction,
154
+ progress,
155
+ previouslyExtractedContent,
156
+ domElements,
157
+ schema,
158
+ llmClient,
159
+ chunksSeen,
160
+ chunksTotal,
161
+ requestId,
162
+ }: {
163
+ instruction: string;
164
+ progress: string;
165
+ previouslyExtractedContent: any;
166
+ domElements: string;
167
+ schema: z.ZodObject<any>;
168
+ llmClient: LLMClient;
169
+ chunksSeen: number;
170
+ chunksTotal: number;
171
+ requestId: string;
172
+ }) {
173
+ const extractionResponse = await llmClient.createChatCompletion({
174
+ messages: [
175
+ buildExtractSystemPrompt(),
176
+ buildExtractUserPrompt(instruction, domElements),
177
+ ],
178
+ response_model: {
179
+ schema: schema,
180
+ name: "Extraction",
181
+ },
182
+ temperature: 0.1,
183
+ top_p: 1,
184
+ frequency_penalty: 0,
185
+ presence_penalty: 0,
186
+ requestId,
187
+ });
188
+
189
+ const refinedResponse = await llmClient.createChatCompletion({
190
+ messages: [
191
+ buildRefineSystemPrompt(),
192
+ buildRefineUserPrompt(
193
+ instruction,
194
+ previouslyExtractedContent,
195
+ extractionResponse,
196
+ ),
197
+ ],
198
+ response_model: {
199
+ schema: schema,
200
+ name: "RefinedExtraction",
201
+ },
202
+ temperature: 0.1,
203
+ top_p: 1,
204
+ frequency_penalty: 0,
205
+ presence_penalty: 0,
206
+ requestId,
207
+ });
208
+
209
+ const metadataSchema = z.object({
210
+ progress: z
211
+ .string()
212
+ .describe(
213
+ "progress of what has been extracted so far, as concise as possible",
214
+ ),
215
+ completed: z
216
+ .boolean()
217
+ .describe(
218
+ "true if the goal is now accomplished. Use this conservatively, only when you are sure that the goal has been completed.",
219
+ ),
220
+ });
221
+
222
+ const metadataResponse = await llmClient.createChatCompletion({
223
+ messages: [
224
+ buildMetadataSystemPrompt(),
225
+ buildMetadataPrompt(
226
+ instruction,
227
+ refinedResponse,
228
+ chunksSeen,
229
+ chunksTotal,
230
+ ),
231
+ ],
232
+ response_model: {
233
+ name: "Metadata",
234
+ schema: metadataSchema,
235
+ },
236
+ temperature: 0.1,
237
+ top_p: 1,
238
+ frequency_penalty: 0,
239
+ presence_penalty: 0,
240
+ requestId,
241
+ });
242
+
243
+ refinedResponse.metadata = metadataResponse;
244
+
245
+ return refinedResponse;
246
+ }
247
+
248
+ export async function observe({
249
+ instruction,
250
+ domElements,
251
+ llmClient,
252
+ image,
253
+ requestId,
254
+ }: {
255
+ instruction: string;
256
+ domElements: string;
257
+ llmClient: LLMClient;
258
+ image?: Buffer;
259
+ requestId: string;
260
+ }): Promise<{
261
+ elements: { elementId: number; description: string }[];
262
+ }> {
263
+ const observeSchema = z.object({
264
+ elements: z
265
+ .array(
266
+ z.object({
267
+ elementId: z.number().describe("the number of the element"),
268
+ description: z
269
+ .string()
270
+ .describe(
271
+ "a description of the element and what it is relevant for",
272
+ ),
273
+ }),
274
+ )
275
+ .describe("an array of elements that match the instruction"),
276
+ });
277
+
278
+ const observationResponse = await llmClient.createChatCompletion({
279
+ messages: [
280
+ buildObserveSystemPrompt(),
281
+ buildObserveUserMessage(instruction, domElements),
282
+ ],
283
+ image: image
284
+ ? { buffer: image, description: AnnotatedScreenshotText }
285
+ : undefined,
286
+ response_model: {
287
+ schema: observeSchema,
288
+ name: "Observation",
289
+ },
290
+ temperature: 0.1,
291
+ top_p: 1,
292
+ frequency_penalty: 0,
293
+ presence_penalty: 0,
294
+ requestId,
295
+ });
296
+
297
+ if (!observationResponse) {
298
+ throw new Error("no response when finding a selector");
299
+ }
300
+
301
+ return observationResponse;
302
+ }
303
+
304
+ export async function ask({
305
+ question,
306
+ llmClient,
307
+ requestId,
308
+ }: {
309
+ question: string;
310
+ llmClient: LLMClient;
311
+ requestId: string;
312
+ }) {
313
+ const response = await llmClient.createChatCompletion({
314
+ messages: [buildAskSystemPrompt(), buildAskUserPrompt(question)],
315
+ temperature: 0.1,
316
+ top_p: 1,
317
+ frequency_penalty: 0,
318
+ presence_penalty: 0,
319
+ requestId,
320
+ });
321
+
322
+ // The parsing is now handled in the LLM clients
323
+ return response.choices[0].message.content;
324
+ }
@@ -0,0 +1,314 @@
1
+ import Anthropic, { ClientOptions } from "@anthropic-ai/sdk";
2
+ import { Message, MessageCreateParams } from "@anthropic-ai/sdk/resources";
3
+ import { zodToJsonSchema } from "zod-to-json-schema";
4
+ import { LogLine } from "../../types/log";
5
+ import { AvailableModel } from "../../types/model";
6
+ import { LLMCache } from "../cache/LLMCache";
7
+ import { ChatCompletionOptions, LLMClient } from "./LLMClient";
8
+
9
+ export class AnthropicClient extends LLMClient {
10
+ private client: Anthropic;
11
+ private cache: LLMCache | undefined;
12
+ public logger: (message: LogLine) => void;
13
+ private enableCaching: boolean;
14
+
15
+ constructor(
16
+ logger: (message: LogLine) => void,
17
+ enableCaching = false,
18
+ cache: LLMCache | undefined,
19
+ modelName: AvailableModel,
20
+ clientOptions?: ClientOptions,
21
+ ) {
22
+ super(modelName);
23
+ this.client = new Anthropic(clientOptions);
24
+ this.logger = logger;
25
+ this.cache = cache;
26
+ this.enableCaching = enableCaching;
27
+ this.modelName = modelName;
28
+ }
29
+
30
+ async createChatCompletion(
31
+ options: ChatCompletionOptions & { retries?: number },
32
+ ): Promise<any> {
33
+ // TODO (kamath): remove this forced typecast
34
+ const { image: _, ...optionsWithoutImage } = options;
35
+ this.logger({
36
+ category: "anthropic",
37
+ message: "creating chat completion",
38
+ level: 1,
39
+ auxiliary: {
40
+ options: {
41
+ value: JSON.stringify(optionsWithoutImage),
42
+ type: "object",
43
+ },
44
+ },
45
+ });
46
+ // Try to get cached response
47
+ const cacheOptions = {
48
+ model: this.modelName,
49
+ messages: options.messages,
50
+ temperature: options.temperature,
51
+ image: options.image,
52
+ response_model: options.response_model,
53
+ tools: options.tools,
54
+ retries: options.retries,
55
+ };
56
+
57
+ if (this.enableCaching) {
58
+ const cachedResponse = await this.cache.get(
59
+ cacheOptions,
60
+ options.requestId,
61
+ );
62
+ if (cachedResponse) {
63
+ this.logger({
64
+ category: "llm_cache",
65
+ message: "LLM cache hit - returning cached response",
66
+ level: 1,
67
+ auxiliary: {
68
+ cachedResponse: {
69
+ value: JSON.stringify(cachedResponse),
70
+ type: "object",
71
+ },
72
+ requestId: {
73
+ value: options.requestId,
74
+ type: "string",
75
+ },
76
+ cacheOptions: {
77
+ value: JSON.stringify(cacheOptions),
78
+ type: "object",
79
+ },
80
+ },
81
+ });
82
+ return cachedResponse;
83
+ } else {
84
+ this.logger({
85
+ category: "llm_cache",
86
+ message: "LLM cache miss - no cached response found",
87
+ level: 1,
88
+ auxiliary: {
89
+ cacheOptions: {
90
+ value: JSON.stringify(cacheOptions),
91
+ type: "object",
92
+ },
93
+ requestId: {
94
+ value: options.requestId,
95
+ type: "string",
96
+ },
97
+ },
98
+ });
99
+ }
100
+ }
101
+
102
+ const systemMessage = options.messages.find((msg) => msg.role === "system");
103
+ const userMessages = options.messages.filter(
104
+ (msg) => msg.role !== "system",
105
+ );
106
+
107
+ if (options.image) {
108
+ const screenshotMessage: any = {
109
+ role: "user",
110
+ content: [
111
+ {
112
+ type: "image",
113
+ source: {
114
+ type: "base64",
115
+ media_type: "image/jpeg",
116
+ data: options.image.buffer.toString("base64"),
117
+ },
118
+ },
119
+ ...(options.image.description
120
+ ? [{ type: "text", text: options.image.description }]
121
+ : []),
122
+ ],
123
+ };
124
+
125
+ options.messages = [...options.messages, screenshotMessage];
126
+ }
127
+
128
+ // Transform tools to Anthropic's format
129
+ let anthropicTools = options.tools?.map((tool: any) => {
130
+ if (tool.type === "function") {
131
+ return {
132
+ name: tool.function.name,
133
+ description: tool.function.description,
134
+ input_schema: {
135
+ type: "object",
136
+ properties: tool.function.parameters.properties,
137
+ required: tool.function.parameters.required,
138
+ },
139
+ };
140
+ }
141
+ return tool;
142
+ });
143
+
144
+ let toolDefinition;
145
+ if (options.response_model) {
146
+ const jsonSchema = zodToJsonSchema(options.response_model.schema);
147
+
148
+ // Extract the actual schema properties
149
+ // TODO (kamath): fix this forced typecast
150
+ const schemaProperties =
151
+ (
152
+ jsonSchema.definitions?.MySchema as {
153
+ properties?: Record<string, any>;
154
+ }
155
+ )?.properties ||
156
+ (jsonSchema as { properties?: Record<string, any> }).properties;
157
+ const schemaRequired =
158
+ (jsonSchema.definitions?.MySchema as { required?: string[] })
159
+ ?.required || (jsonSchema as { required?: string[] }).required;
160
+
161
+ toolDefinition = {
162
+ name: "print_extracted_data",
163
+ description: "Prints the extracted data based on the provided schema.",
164
+ input_schema: {
165
+ type: "object",
166
+ properties: schemaProperties,
167
+ required: schemaRequired,
168
+ },
169
+ };
170
+ }
171
+
172
+ if (toolDefinition) {
173
+ anthropicTools = anthropicTools ?? [];
174
+ anthropicTools.push(toolDefinition);
175
+ }
176
+
177
+ const response = (await this.client.messages.create({
178
+ model: this.modelName,
179
+ max_tokens: options.maxTokens || 3000,
180
+ messages: userMessages.map((msg) => ({
181
+ role: msg.role,
182
+ content: msg.content,
183
+ })),
184
+ tools: anthropicTools,
185
+ system: systemMessage?.content,
186
+ temperature: options.temperature,
187
+ } as MessageCreateParams)) as Message; // TODO (kamath): remove this forced typecast
188
+
189
+ this.logger({
190
+ category: "anthropic",
191
+ message: "response",
192
+ level: 1,
193
+ auxiliary: {
194
+ response: {
195
+ value: JSON.stringify(response),
196
+ type: "object",
197
+ },
198
+ requestId: {
199
+ value: options.requestId,
200
+ type: "string",
201
+ },
202
+ },
203
+ });
204
+
205
+ // Parse the response here
206
+ const transformedResponse = {
207
+ id: response.id,
208
+ object: "chat.completion",
209
+ created: Date.now(),
210
+ model: response.model,
211
+ choices: [
212
+ {
213
+ index: 0,
214
+ message: {
215
+ role: "assistant",
216
+ content:
217
+ response.content.find((c) => c.type === "text")?.text || null,
218
+ tool_calls: response.content
219
+ .filter((c) => c.type === "tool_use")
220
+ .map((toolUse: any) => ({
221
+ id: toolUse.id,
222
+ type: "function",
223
+ function: {
224
+ name: toolUse.name,
225
+ arguments: JSON.stringify(toolUse.input),
226
+ },
227
+ })),
228
+ },
229
+ finish_reason: response.stop_reason,
230
+ },
231
+ ],
232
+ usage: {
233
+ prompt_tokens: response.usage.input_tokens,
234
+ completion_tokens: response.usage.output_tokens,
235
+ total_tokens:
236
+ response.usage.input_tokens + response.usage.output_tokens,
237
+ },
238
+ };
239
+
240
+ this.logger({
241
+ category: "anthropic",
242
+ message: "transformed response",
243
+ level: 1,
244
+ auxiliary: {
245
+ transformedResponse: {
246
+ value: JSON.stringify(transformedResponse),
247
+ type: "object",
248
+ },
249
+ requestId: {
250
+ value: options.requestId,
251
+ type: "string",
252
+ },
253
+ },
254
+ });
255
+
256
+ if (options.response_model) {
257
+ const toolUse = response.content.find((c) => c.type === "tool_use");
258
+ if (toolUse && "input" in toolUse) {
259
+ const result = toolUse.input;
260
+ if (this.enableCaching) {
261
+ this.cache.set(cacheOptions, result, options.requestId);
262
+ }
263
+
264
+ return result;
265
+ } else {
266
+ if (!options.retries || options.retries < 5) {
267
+ return this.createChatCompletion({
268
+ ...options,
269
+ retries: (options.retries ?? 0) + 1,
270
+ });
271
+ }
272
+ this.logger({
273
+ category: "anthropic",
274
+ message: "error creating chat completion",
275
+ level: 1,
276
+ auxiliary: {
277
+ requestId: {
278
+ value: options.requestId,
279
+ type: "string",
280
+ },
281
+ },
282
+ });
283
+ throw new Error(
284
+ "Create Chat Completion Failed: No tool use with input in response",
285
+ );
286
+ }
287
+ }
288
+
289
+ if (this.enableCaching) {
290
+ this.cache.set(cacheOptions, transformedResponse, options.requestId);
291
+ this.logger({
292
+ category: "anthropic",
293
+ message: "cached response",
294
+ level: 1,
295
+ auxiliary: {
296
+ requestId: {
297
+ value: options.requestId,
298
+ type: "string",
299
+ },
300
+ transformedResponse: {
301
+ value: JSON.stringify(transformedResponse),
302
+ type: "object",
303
+ },
304
+ cacheOptions: {
305
+ value: JSON.stringify(cacheOptions),
306
+ type: "object",
307
+ },
308
+ },
309
+ });
310
+ }
311
+
312
+ return transformedResponse;
313
+ }
314
+ }
@@ -0,0 +1,66 @@
1
+ import { AvailableModel, ToolCall } from "../../types/model";
2
+
3
// One conversational turn passed to an LLM client.
export interface ChatMessage {
  role: "system" | "user" | "assistant";
  content: ChatMessageContent;
}

// Message content is either plain text or a list of typed parts
// (text and/or image) for multimodal requests.
export type ChatMessageContent =
  | string
  | (ChatMessageImageContent | ChatMessageTextContent)[];

// Image part that references the image by URL.
export interface ChatMessageImageContent {
  type: "image_url";
  image_url: { url: string };
  text?: string;
}

// Plain-text part of a multimodal message.
export interface ChatMessageTextContent {
  type: string;
  text: string;
}

// Models that accept image input; LLMClient.hasVision is derived from this.
export const modelsWithVision: AvailableModel[] = [
  "gpt-4o",
  "gpt-4o-mini",
  "claude-3-5-sonnet-latest",
  "claude-3-5-sonnet-20240620",
  "claude-3-5-sonnet-20241022",
  "gpt-4o-2024-08-06",
];

// Standard caption attached to annotated screenshots sent alongside
// act/observe requests.
export const AnnotatedScreenshotText =
  "This is a screenshot of the current page state with the elements annotated on it. Each element id is annotated with a number to the top left of it. Duplicate annotations at the same location are under each other vertically.";

// Provider-agnostic options accepted by LLMClient.createChatCompletion.
export interface ChatCompletionOptions {
  messages: ChatMessage[];
  // Sampling parameters (OpenAI naming; provider clients map as needed).
  temperature?: number;
  top_p?: number;
  frequency_penalty?: number;
  presence_penalty?: number;
  // Optional screenshot to attach to the request.
  image?: {
    buffer: Buffer;
    description?: string;
  };
  // When set, clients return data parsed against this schema instead of a
  // raw chat.completion response.
  response_model?: {
    name: string;
    schema: any;
  };
  tools?: ToolCall[];
  tool_choice?: string;
  maxTokens?: number;
  // Correlates log entries and cache lookups for one logical request.
  requestId: string;
}
54
+
55
// Base class for provider-specific LLM clients (e.g. AnthropicClient).
// Holds the selected model and whether it supports image input.
export abstract class LLMClient {
  public modelName: AvailableModel;
  // True when modelName appears in modelsWithVision.
  public hasVision: boolean;

  constructor(modelName: AvailableModel) {
    this.modelName = modelName;
    this.hasVision = modelsWithVision.includes(modelName);
  }

  // Subclasses perform the provider request and response transformation.
  abstract createChatCompletion(options: ChatCompletionOptions): Promise<any>;
  abstract logger: (message: { category?: string; message: string }) => void;
}