@browserbasehq/stagehand 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,324 @@
1
+ import {
2
+ actTools,
3
+ buildActSystemPrompt,
4
+ buildActUserPrompt,
5
+ buildAskSystemPrompt,
6
+ buildExtractSystemPrompt,
7
+ buildExtractUserPrompt,
8
+ buildObserveSystemPrompt,
9
+ buildObserveUserMessage,
10
+ buildAskUserPrompt,
11
+ buildVerifyActCompletionSystemPrompt,
12
+ buildVerifyActCompletionUserPrompt,
13
+ buildRefineSystemPrompt,
14
+ buildRefineUserPrompt,
15
+ buildMetadataSystemPrompt,
16
+ buildMetadataPrompt,
17
+ } from "./prompt";
18
+ import { z } from "zod";
19
+ import {
20
+ AnnotatedScreenshotText,
21
+ ChatMessage,
22
+ LLMClient,
23
+ } from "./llm/LLMClient";
24
+ import { VerifyActCompletionParams } from "../types/inference";
25
+ import { ActResult, ActParams } from "../types/act";
26
+
27
+ export async function verifyActCompletion({
28
+ goal,
29
+ steps,
30
+ llmClient,
31
+ screenshot,
32
+ domElements,
33
+ logger,
34
+ requestId,
35
+ }: VerifyActCompletionParams): Promise<boolean> {
36
+ const messages: ChatMessage[] = [
37
+ buildVerifyActCompletionSystemPrompt(),
38
+ buildVerifyActCompletionUserPrompt(goal, steps, domElements),
39
+ ];
40
+
41
+ const response = await llmClient.createChatCompletion({
42
+ messages,
43
+ temperature: 0.1,
44
+ top_p: 1,
45
+ frequency_penalty: 0,
46
+ presence_penalty: 0,
47
+ image: screenshot
48
+ ? {
49
+ buffer: screenshot,
50
+ description: "This is a screenshot of the whole visible page.",
51
+ }
52
+ : undefined,
53
+ response_model: {
54
+ name: "Verification",
55
+ schema: z.object({
56
+ completed: z.boolean().describe("true if the goal is accomplished"),
57
+ }),
58
+ },
59
+ requestId,
60
+ });
61
+
62
+ if (!response || typeof response !== "object") {
63
+ logger({
64
+ category: "VerifyAct",
65
+ message: "Unexpected response format: " + JSON.stringify(response),
66
+ });
67
+ return false;
68
+ }
69
+
70
+ if (response.completed === undefined) {
71
+ logger({
72
+ category: "VerifyAct",
73
+ message: "Missing 'completed' field in response",
74
+ });
75
+ return false;
76
+ }
77
+
78
+ return response.completed;
79
+ }
80
+
81
+ export function fillInVariables(
82
+ text: string,
83
+ variables: Record<string, string>,
84
+ ) {
85
+ let processedText = text;
86
+ Object.entries(variables).forEach(([key, value]) => {
87
+ const placeholder = `<|${key.toUpperCase()}|>`;
88
+ processedText = processedText.replace(placeholder, value);
89
+ });
90
+ return processedText;
91
+ }
92
+
93
+ export async function act({
94
+ action,
95
+ domElements,
96
+ steps,
97
+ llmClient,
98
+ screenshot,
99
+ retries = 0,
100
+ logger,
101
+ requestId,
102
+ variables,
103
+ }: ActParams): Promise<ActResult | null> {
104
+ const messages: ChatMessage[] = [
105
+ buildActSystemPrompt(),
106
+ buildActUserPrompt(action, steps, domElements, variables),
107
+ ];
108
+
109
+ const response = await llmClient.createChatCompletion({
110
+ messages,
111
+ temperature: 0.1,
112
+ top_p: 1,
113
+ frequency_penalty: 0,
114
+ presence_penalty: 0,
115
+ tool_choice: "auto" as const,
116
+ tools: actTools,
117
+ image: screenshot
118
+ ? { buffer: screenshot, description: AnnotatedScreenshotText }
119
+ : undefined,
120
+ requestId,
121
+ });
122
+
123
+ const toolCalls = response.choices[0].message.tool_calls;
124
+
125
+ if (toolCalls && toolCalls.length > 0) {
126
+ if (toolCalls[0].function.name === "skipSection") {
127
+ return null;
128
+ }
129
+
130
+ return JSON.parse(toolCalls[0].function.arguments);
131
+ } else {
132
+ if (retries >= 2) {
133
+ logger({
134
+ category: "Act",
135
+ message: "No tool calls found in response",
136
+ });
137
+ return null;
138
+ }
139
+
140
+ return act({
141
+ action,
142
+ domElements,
143
+ steps,
144
+ llmClient,
145
+ retries: retries + 1,
146
+ logger,
147
+ requestId,
148
+ });
149
+ }
150
+ }
151
+
152
+ export async function extract({
153
+ instruction,
154
+ progress,
155
+ previouslyExtractedContent,
156
+ domElements,
157
+ schema,
158
+ llmClient,
159
+ chunksSeen,
160
+ chunksTotal,
161
+ requestId,
162
+ }: {
163
+ instruction: string;
164
+ progress: string;
165
+ previouslyExtractedContent: any;
166
+ domElements: string;
167
+ schema: z.ZodObject<any>;
168
+ llmClient: LLMClient;
169
+ chunksSeen: number;
170
+ chunksTotal: number;
171
+ requestId: string;
172
+ }) {
173
+ const extractionResponse = await llmClient.createChatCompletion({
174
+ messages: [
175
+ buildExtractSystemPrompt(),
176
+ buildExtractUserPrompt(instruction, domElements),
177
+ ],
178
+ response_model: {
179
+ schema: schema,
180
+ name: "Extraction",
181
+ },
182
+ temperature: 0.1,
183
+ top_p: 1,
184
+ frequency_penalty: 0,
185
+ presence_penalty: 0,
186
+ requestId,
187
+ });
188
+
189
+ const refinedResponse = await llmClient.createChatCompletion({
190
+ messages: [
191
+ buildRefineSystemPrompt(),
192
+ buildRefineUserPrompt(
193
+ instruction,
194
+ previouslyExtractedContent,
195
+ extractionResponse,
196
+ ),
197
+ ],
198
+ response_model: {
199
+ schema: schema,
200
+ name: "RefinedExtraction",
201
+ },
202
+ temperature: 0.1,
203
+ top_p: 1,
204
+ frequency_penalty: 0,
205
+ presence_penalty: 0,
206
+ requestId,
207
+ });
208
+
209
+ const metadataSchema = z.object({
210
+ progress: z
211
+ .string()
212
+ .describe(
213
+ "progress of what has been extracted so far, as concise as possible",
214
+ ),
215
+ completed: z
216
+ .boolean()
217
+ .describe(
218
+ "true if the goal is now accomplished. Use this conservatively, only when you are sure that the goal has been completed.",
219
+ ),
220
+ });
221
+
222
+ const metadataResponse = await llmClient.createChatCompletion({
223
+ messages: [
224
+ buildMetadataSystemPrompt(),
225
+ buildMetadataPrompt(
226
+ instruction,
227
+ refinedResponse,
228
+ chunksSeen,
229
+ chunksTotal,
230
+ ),
231
+ ],
232
+ response_model: {
233
+ name: "Metadata",
234
+ schema: metadataSchema,
235
+ },
236
+ temperature: 0.1,
237
+ top_p: 1,
238
+ frequency_penalty: 0,
239
+ presence_penalty: 0,
240
+ requestId,
241
+ });
242
+
243
+ refinedResponse.metadata = metadataResponse;
244
+
245
+ return refinedResponse;
246
+ }
247
+
248
+ export async function observe({
249
+ instruction,
250
+ domElements,
251
+ llmClient,
252
+ image,
253
+ requestId,
254
+ }: {
255
+ instruction: string;
256
+ domElements: string;
257
+ llmClient: LLMClient;
258
+ image?: Buffer;
259
+ requestId: string;
260
+ }): Promise<{
261
+ elements: { elementId: number; description: string }[];
262
+ }> {
263
+ const observeSchema = z.object({
264
+ elements: z
265
+ .array(
266
+ z.object({
267
+ elementId: z.number().describe("the number of the element"),
268
+ description: z
269
+ .string()
270
+ .describe(
271
+ "a description of the element and what it is relevant for",
272
+ ),
273
+ }),
274
+ )
275
+ .describe("an array of elements that match the instruction"),
276
+ });
277
+
278
+ const observationResponse = await llmClient.createChatCompletion({
279
+ messages: [
280
+ buildObserveSystemPrompt(),
281
+ buildObserveUserMessage(instruction, domElements),
282
+ ],
283
+ image: image
284
+ ? { buffer: image, description: AnnotatedScreenshotText }
285
+ : undefined,
286
+ response_model: {
287
+ schema: observeSchema,
288
+ name: "Observation",
289
+ },
290
+ temperature: 0.1,
291
+ top_p: 1,
292
+ frequency_penalty: 0,
293
+ presence_penalty: 0,
294
+ requestId,
295
+ });
296
+
297
+ if (!observationResponse) {
298
+ throw new Error("no response when finding a selector");
299
+ }
300
+
301
+ return observationResponse;
302
+ }
303
+
304
+ export async function ask({
305
+ question,
306
+ llmClient,
307
+ requestId,
308
+ }: {
309
+ question: string;
310
+ llmClient: LLMClient;
311
+ requestId: string;
312
+ }) {
313
+ const response = await llmClient.createChatCompletion({
314
+ messages: [buildAskSystemPrompt(), buildAskUserPrompt(question)],
315
+ temperature: 0.1,
316
+ top_p: 1,
317
+ frequency_penalty: 0,
318
+ presence_penalty: 0,
319
+ requestId,
320
+ });
321
+
322
+ // The parsing is now handled in the LLM clients
323
+ return response.choices[0].message.content;
324
+ }
@@ -0,0 +1,314 @@
1
+ import Anthropic, { ClientOptions } from "@anthropic-ai/sdk";
2
+ import { Message, MessageCreateParams } from "@anthropic-ai/sdk/resources";
3
+ import { zodToJsonSchema } from "zod-to-json-schema";
4
+ import { LogLine } from "../../types/log";
5
+ import { AvailableModel } from "../../types/model";
6
+ import { LLMCache } from "../cache/LLMCache";
7
+ import { ChatCompletionOptions, LLMClient } from "./LLMClient";
8
+
9
+ export class AnthropicClient extends LLMClient {
10
+ private client: Anthropic;
11
+ private cache: LLMCache | undefined;
12
+ public logger: (message: LogLine) => void;
13
+ private enableCaching: boolean;
14
+
15
+ constructor(
16
+ logger: (message: LogLine) => void,
17
+ enableCaching = false,
18
+ cache: LLMCache | undefined,
19
+ modelName: AvailableModel,
20
+ clientOptions?: ClientOptions,
21
+ ) {
22
+ super(modelName);
23
+ this.client = new Anthropic(clientOptions);
24
+ this.logger = logger;
25
+ this.cache = cache;
26
+ this.enableCaching = enableCaching;
27
+ this.modelName = modelName;
28
+ }
29
+
30
+ async createChatCompletion(
31
+ options: ChatCompletionOptions & { retries?: number },
32
+ ): Promise<any> {
33
+ // TODO (kamath): remove this forced typecast
34
+ const { image: _, ...optionsWithoutImage } = options;
35
+ this.logger({
36
+ category: "anthropic",
37
+ message: "creating chat completion",
38
+ level: 1,
39
+ auxiliary: {
40
+ options: {
41
+ value: JSON.stringify(optionsWithoutImage),
42
+ type: "object",
43
+ },
44
+ },
45
+ });
46
+ // Try to get cached response
47
+ const cacheOptions = {
48
+ model: this.modelName,
49
+ messages: options.messages,
50
+ temperature: options.temperature,
51
+ image: options.image,
52
+ response_model: options.response_model,
53
+ tools: options.tools,
54
+ retries: options.retries,
55
+ };
56
+
57
+ if (this.enableCaching) {
58
+ const cachedResponse = await this.cache.get(
59
+ cacheOptions,
60
+ options.requestId,
61
+ );
62
+ if (cachedResponse) {
63
+ this.logger({
64
+ category: "llm_cache",
65
+ message: "LLM cache hit - returning cached response",
66
+ level: 1,
67
+ auxiliary: {
68
+ cachedResponse: {
69
+ value: JSON.stringify(cachedResponse),
70
+ type: "object",
71
+ },
72
+ requestId: {
73
+ value: options.requestId,
74
+ type: "string",
75
+ },
76
+ cacheOptions: {
77
+ value: JSON.stringify(cacheOptions),
78
+ type: "object",
79
+ },
80
+ },
81
+ });
82
+ return cachedResponse;
83
+ } else {
84
+ this.logger({
85
+ category: "llm_cache",
86
+ message: "LLM cache miss - no cached response found",
87
+ level: 1,
88
+ auxiliary: {
89
+ cacheOptions: {
90
+ value: JSON.stringify(cacheOptions),
91
+ type: "object",
92
+ },
93
+ requestId: {
94
+ value: options.requestId,
95
+ type: "string",
96
+ },
97
+ },
98
+ });
99
+ }
100
+ }
101
+
102
+ const systemMessage = options.messages.find((msg) => msg.role === "system");
103
+ const userMessages = options.messages.filter(
104
+ (msg) => msg.role !== "system",
105
+ );
106
+
107
+ if (options.image) {
108
+ const screenshotMessage: any = {
109
+ role: "user",
110
+ content: [
111
+ {
112
+ type: "image",
113
+ source: {
114
+ type: "base64",
115
+ media_type: "image/jpeg",
116
+ data: options.image.buffer.toString("base64"),
117
+ },
118
+ },
119
+ ...(options.image.description
120
+ ? [{ type: "text", text: options.image.description }]
121
+ : []),
122
+ ],
123
+ };
124
+
125
+ options.messages = [...options.messages, screenshotMessage];
126
+ }
127
+
128
+ // Transform tools to Anthropic's format
129
+ let anthropicTools = options.tools?.map((tool: any) => {
130
+ if (tool.type === "function") {
131
+ return {
132
+ name: tool.function.name,
133
+ description: tool.function.description,
134
+ input_schema: {
135
+ type: "object",
136
+ properties: tool.function.parameters.properties,
137
+ required: tool.function.parameters.required,
138
+ },
139
+ };
140
+ }
141
+ return tool;
142
+ });
143
+
144
+ let toolDefinition;
145
+ if (options.response_model) {
146
+ const jsonSchema = zodToJsonSchema(options.response_model.schema);
147
+
148
+ // Extract the actual schema properties
149
+ // TODO (kamath): fix this forced typecast
150
+ const schemaProperties =
151
+ (
152
+ jsonSchema.definitions?.MySchema as {
153
+ properties?: Record<string, any>;
154
+ }
155
+ )?.properties ||
156
+ (jsonSchema as { properties?: Record<string, any> }).properties;
157
+ const schemaRequired =
158
+ (jsonSchema.definitions?.MySchema as { required?: string[] })
159
+ ?.required || (jsonSchema as { required?: string[] }).required;
160
+
161
+ toolDefinition = {
162
+ name: "print_extracted_data",
163
+ description: "Prints the extracted data based on the provided schema.",
164
+ input_schema: {
165
+ type: "object",
166
+ properties: schemaProperties,
167
+ required: schemaRequired,
168
+ },
169
+ };
170
+ }
171
+
172
+ if (toolDefinition) {
173
+ anthropicTools = anthropicTools ?? [];
174
+ anthropicTools.push(toolDefinition);
175
+ }
176
+
177
+ const response = (await this.client.messages.create({
178
+ model: this.modelName,
179
+ max_tokens: options.maxTokens || 3000,
180
+ messages: userMessages.map((msg) => ({
181
+ role: msg.role,
182
+ content: msg.content,
183
+ })),
184
+ tools: anthropicTools,
185
+ system: systemMessage?.content,
186
+ temperature: options.temperature,
187
+ } as MessageCreateParams)) as Message; // TODO (kamath): remove this forced typecast
188
+
189
+ this.logger({
190
+ category: "anthropic",
191
+ message: "response",
192
+ level: 1,
193
+ auxiliary: {
194
+ response: {
195
+ value: JSON.stringify(response),
196
+ type: "object",
197
+ },
198
+ requestId: {
199
+ value: options.requestId,
200
+ type: "string",
201
+ },
202
+ },
203
+ });
204
+
205
+ // Parse the response here
206
+ const transformedResponse = {
207
+ id: response.id,
208
+ object: "chat.completion",
209
+ created: Date.now(),
210
+ model: response.model,
211
+ choices: [
212
+ {
213
+ index: 0,
214
+ message: {
215
+ role: "assistant",
216
+ content:
217
+ response.content.find((c) => c.type === "text")?.text || null,
218
+ tool_calls: response.content
219
+ .filter((c) => c.type === "tool_use")
220
+ .map((toolUse: any) => ({
221
+ id: toolUse.id,
222
+ type: "function",
223
+ function: {
224
+ name: toolUse.name,
225
+ arguments: JSON.stringify(toolUse.input),
226
+ },
227
+ })),
228
+ },
229
+ finish_reason: response.stop_reason,
230
+ },
231
+ ],
232
+ usage: {
233
+ prompt_tokens: response.usage.input_tokens,
234
+ completion_tokens: response.usage.output_tokens,
235
+ total_tokens:
236
+ response.usage.input_tokens + response.usage.output_tokens,
237
+ },
238
+ };
239
+
240
+ this.logger({
241
+ category: "anthropic",
242
+ message: "transformed response",
243
+ level: 1,
244
+ auxiliary: {
245
+ transformedResponse: {
246
+ value: JSON.stringify(transformedResponse),
247
+ type: "object",
248
+ },
249
+ requestId: {
250
+ value: options.requestId,
251
+ type: "string",
252
+ },
253
+ },
254
+ });
255
+
256
+ if (options.response_model) {
257
+ const toolUse = response.content.find((c) => c.type === "tool_use");
258
+ if (toolUse && "input" in toolUse) {
259
+ const result = toolUse.input;
260
+ if (this.enableCaching) {
261
+ this.cache.set(cacheOptions, result, options.requestId);
262
+ }
263
+
264
+ return result;
265
+ } else {
266
+ if (!options.retries || options.retries < 5) {
267
+ return this.createChatCompletion({
268
+ ...options,
269
+ retries: (options.retries ?? 0) + 1,
270
+ });
271
+ }
272
+ this.logger({
273
+ category: "anthropic",
274
+ message: "error creating chat completion",
275
+ level: 1,
276
+ auxiliary: {
277
+ requestId: {
278
+ value: options.requestId,
279
+ type: "string",
280
+ },
281
+ },
282
+ });
283
+ throw new Error(
284
+ "Create Chat Completion Failed: No tool use with input in response",
285
+ );
286
+ }
287
+ }
288
+
289
+ if (this.enableCaching) {
290
+ this.cache.set(cacheOptions, transformedResponse, options.requestId);
291
+ this.logger({
292
+ category: "anthropic",
293
+ message: "cached response",
294
+ level: 1,
295
+ auxiliary: {
296
+ requestId: {
297
+ value: options.requestId,
298
+ type: "string",
299
+ },
300
+ transformedResponse: {
301
+ value: JSON.stringify(transformedResponse),
302
+ type: "object",
303
+ },
304
+ cacheOptions: {
305
+ value: JSON.stringify(cacheOptions),
306
+ type: "object",
307
+ },
308
+ },
309
+ });
310
+ }
311
+
312
+ return transformedResponse;
313
+ }
314
+ }
@@ -0,0 +1,66 @@
1
+ import { AvailableModel, ToolCall } from "../../types/model";
2
+
3
// One conversational turn passed to an LLM client.
export interface ChatMessage {
  role: "system" | "user" | "assistant";
  content: ChatMessageContent;
}

// Message content is either plain text or a list of typed parts
// (text and/or image) for multimodal requests.
export type ChatMessageContent =
  | string
  | (ChatMessageImageContent | ChatMessageTextContent)[];

// Image part that references the image by URL.
export interface ChatMessageImageContent {
  type: "image_url";
  image_url: { url: string };
  text?: string;
}

// Plain-text part of a multimodal message.
export interface ChatMessageTextContent {
  type: string;
  text: string;
}

// Models that accept image input; LLMClient.hasVision is derived from this.
export const modelsWithVision: AvailableModel[] = [
  "gpt-4o",
  "gpt-4o-mini",
  "claude-3-5-sonnet-latest",
  "claude-3-5-sonnet-20240620",
  "claude-3-5-sonnet-20241022",
  "gpt-4o-2024-08-06",
];

// Standard caption attached to annotated screenshots sent alongside
// act/observe requests.
export const AnnotatedScreenshotText =
  "This is a screenshot of the current page state with the elements annotated on it. Each element id is annotated with a number to the top left of it. Duplicate annotations at the same location are under each other vertically.";

// Provider-agnostic options accepted by LLMClient.createChatCompletion.
export interface ChatCompletionOptions {
  messages: ChatMessage[];
  // Sampling parameters (OpenAI naming; provider clients map as needed).
  temperature?: number;
  top_p?: number;
  frequency_penalty?: number;
  presence_penalty?: number;
  // Optional screenshot to attach to the request.
  image?: {
    buffer: Buffer;
    description?: string;
  };
  // When set, clients return data parsed against this schema instead of a
  // raw chat.completion response.
  response_model?: {
    name: string;
    schema: any;
  };
  tools?: ToolCall[];
  tool_choice?: string;
  maxTokens?: number;
  // Correlates log entries and cache lookups for one logical request.
  requestId: string;
}
54
+
55
// Base class for provider-specific LLM clients (e.g. AnthropicClient).
// Holds the selected model and whether it supports image input.
export abstract class LLMClient {
  public modelName: AvailableModel;
  // True when modelName appears in modelsWithVision.
  public hasVision: boolean;

  constructor(modelName: AvailableModel) {
    this.modelName = modelName;
    this.hasVision = modelsWithVision.includes(modelName);
  }

  // Subclasses perform the provider request and response transformation.
  abstract createChatCompletion(options: ChatCompletionOptions): Promise<any>;
  abstract logger: (message: { category?: string; message: string }) => void;
}