npm - @browserbasehq/orca - Versions diffs - 3.5.0-preview.0 → 3.5.0-preview.1 - Mend

@browserbasehq/orca 3.5.0-preview.0 → 3.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/dist/cjs/lib/v3LegacyEvaluator.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"v3LegacyEvaluator.js","sourceRoot":"","sources":["../../../lib/v3LegacyEvaluator.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAEH,6BAAwB;AAWxB,4DAAsD;AACtD,iEAA+E;AAE/E,MAAM,gBAAgB,GAAG,OAAC,CAAC,MAAM,CAAC;IAChC,UAAU,EAAE,OAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACjC,SAAS,EAAE,OAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAEH,MAAM,qBAAqB,GAAG,OAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAExD,MAAa,iBAAiB;IACpB,EAAE,CAAK;IACP,SAAS,CAAiB;IAC1B,kBAAkB,CAAqC;IACvD,YAAY,GAA+B,GAAG,EAAE,GAAE,CAAC,CAAC;IAE5D,YACE,EAAM,EACN,SAA0B,EAC1B,kBAAkC;QAElC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,GAAG,SAAS,IAAK,yBAA4C,CAAC;QAC5E,IAAI,CAAC,kBAAkB,GAAG,kBAAkB,IAAI;YAC9C,MAAM,EACJ,OAAO,CAAC,GAAG,CAAC,cAAc;gBAC1B,OAAO,CAAC,GAAG,CAAC,4BAA4B;gBACxC,EAAE;SACL,CAAC;IACJ,CAAC;IAEO,SAAS;QACf,sEAAsE;QACtE,MAAM,QAAQ,GAAG,IAAI,4BAAW,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,UAAU,GAAG,IAAI,EACjB,YAAY,EACZ,iBAAiB,GAAG,GAAG,EACvB,cAAc,GACf,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,MAAM,IAAI,CAAC,UAAU;YACxB,MAAM,IAAI,4CAA6B,CACrC,qDAAqD,CACtD,CAAC;QAEJ,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,gCAAgC,CAAC;gBAC3C,QAAQ;gBACR,WAAW,EAAE,UAAU;gBACvB,YAAY;gBACZ,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,mBAAmB,GAAG,kIAAkI,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,sDAAsD,8HAA8H,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAElZ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,
QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,IAAI,mBAAmB,EAAE;oBAChE;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,6CAA6C,cAAc,EAAE;oCACpF,CAAC,CAAC,QAAQ;6BACb;4BACD,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,MAAM,EACJ,SAAS,EACT,UAAU,GAAG,IAAI,EACjB,YAAY,GAAG,8EAA8E,EAC7F,iBAAiB,GAAG,GAAG,GACxB,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,SAAS,EAAE,MAAM;YACpB,MAAM,IAAI,4CAA6B,CACrC,iCAAiC,CAClC,CAAC;QAEJ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,SAAS,GAAG,SAAS;aACxB,GAAG,CACF,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CACV,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAClF;aACA,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,GAAG,
YAAY,2CAA2C,UAAU,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,6CAA6C,CAAC,CAAC,CAAC,EAAE,6KAA6K;qBAChX;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE;4BACjC,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE;oBACd,IAAI,EAAE,uBAAuB;oBAC7B,MAAM,EAAE,qBAAqB;iBAC9B;aACF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,QAAQ,CAAC,IAExB,CAAC;YACF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzB,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS;aACvB,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1B,UAAU,EAAE,SAAkB;gBAC9B,SAAS,EAAE,sCAAsC,YAAY,EAAE;aAChE,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,gCAAgC,CAAC,OAK9C;QACC,MAAM,EACJ,QAAQ,EACR,WAAW,EACX,cAAc,EACd,YAAY,GAAG;UACX,cAAc,CAAC,CAAC,CAAC,iGAAiG,CAAC,CAAC,CAAC,EAAE;;;UAGvH,cAAc,CAAC,CAAC,CAAC,kNAAkN,CAAC,CAAC,CAAC,EAAE;0BACxN,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,GACtD,GAAG,OAAO,CAAC;QAEZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAC1C,MAAM,IAAI,4CAA6B,CACrC,0CAA0C,CAC3C,CAAC;QAEJ,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5C,IAAI,EAAE,WAAoB;YAC1B,SAAS,EAAE,EAAE,GAAG,EAAE,0BAA0B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,EAAE;SACrE,CAAC,CAAC,CAAC;QAEJ,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;oBACzC;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,
CAAC,aAAa,QAAQ,2DAA2D,cAAc,qBAAqB,WAAW,CAAC,MAAM,sKAAsK;oCAC7S,CAAC,CAAC,GAAG,QAAQ,qBAAqB,WAAW,CAAC,MAAM,mIAAmI;6BAC1L;4BACD,GAAG,aAAa;yBACjB;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;CACF;AA5QD,8CA4QC","sourcesContent":["/*\n Legacy V3 evaluator implementation.\n \n This is the behavior-preserving implementation that backs V3Evaluator when\n * STAGEHAND_EVALUATOR_BACKEND=legacy.\n */\n\nimport { z } from \"zod\";\nimport type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { LLMParsedResponse } from \"./inference.js\";\nimport { LLMResponse, LLMClient } from \"./v3/llm/LLMClient.js\";\nimport { LogLine } from \"./v3/types/public/logs.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { LLMProvider } from \"./v3/llm/LLMProvider.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\n\nconst EvaluationSchema = z.object({\n evaluation: z.enum([\"YES\", \"NO\"]),\n reasoning: z.string(),\n});\n\nconst BatchEvaluationSchema = z.array(EvaluationSchema);\n\nexport class LegacyV3Evaluator {\n private v3: V3;\n private modelName: AvailableModel;\n private modelClientOptions: ClientOptions \| { apiKey: string };\n private silentLogger: (message: LogLine) => void = () => {};\n\n constructor(\n v3: V3,\n modelName?: AvailableModel,\n modelClientOptions?: ClientOptions,\n ) {\n this.v3 = v3;\n this.modelName = modelName \|\| (\"google/gemini-2.5-flash\" as AvailableModel);\n 
this.modelClientOptions = modelClientOptions \|\| {\n apiKey:\n process.env.GEMINI_API_KEY \|\|\n process.env.GOOGLE_GENERATIVE_AI_API_KEY \|\|\n \"\",\n };\n }\n\n private getClient(): LLMClient {\n // Prefer a dedicated provider so we can override model per-evaluation\n const provider = new LLMProvider(this.v3.logger);\n return provider.getClient(this.modelName, this.modelClientOptions);\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshot = true,\n systemPrompt,\n screenshotDelayMs = 250,\n agentReasoning,\n } = options;\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!answer && !screenshot)\n throw new StagehandInvalidArgumentError(\n \"Either answer (text) or screenshot must be provided\",\n );\n\n if (Array.isArray(screenshot)) {\n return this._evaluateWithMultipleScreenshots({\n question,\n screenshots: screenshot,\n systemPrompt,\n agentReasoning,\n });\n }\n\n const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? \"a screenshot\" : \"the agents reasoning and actions throughout the task\"} that you can use to evaluate the tasks completion. 
Provide detailed reasoning for your answer.\\n Today's date is ${new Date().toLocaleDateString()}`;\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer \| undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt \|\| defaultSystemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions taken:\\n${agentReasoning}`\n : question,\n },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? 
error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n const {\n questions,\n screenshot = true,\n systemPrompt = \"You are an expert evaluator that returns YES or NO with a concise reasoning.\",\n screenshotDelayMs = 250,\n } = options;\n if (!questions?.length)\n throw new StagehandInvalidArgumentError(\n \"Questions array cannot be empty\",\n );\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer \| undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const formatted = questions\n .map(\n (item, i) =>\n `${i + 1}. ${item.question}${item.answer ? `\\n Answer: ${item.answer}` : \"\"}`,\n )\n .join(\"\\n\\n\");\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n {\n role: \"system\",\n content: `${systemPrompt}\\n\\nYou will be given multiple questions${screenshot ? \" with a screenshot\" : \"\"}. ${questions.some((q) => q.answer) ? \"Some questions include answers to evaluate.\" : \"\"} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,\n },\n {\n role: \"user\",\n content: [\n { type: \"text\", text: formatted },\n ...(screenshot && imageBuffer\n ? 
[\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ],\n },\n ],\n response_model: {\n name: \"BatchEvaluationResult\",\n schema: BatchEvaluationSchema,\n },\n },\n });\n\n try {\n const results = response.data as unknown as z.infer<\n typeof BatchEvaluationSchema\n >;\n return results.map((r) => ({\n evaluation: r.evaluation,\n reasoning: r.reasoning,\n }));\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return questions.map(() => ({\n evaluation: \"INVALID\" as const,\n reasoning: `Failed to get structured response: ${errorMessage}`,\n }));\n }\n }\n\n private async _evaluateWithMultipleScreenshots(options: {\n question: string;\n screenshots: Buffer[];\n systemPrompt?: string;\n agentReasoning?: string;\n }): Promise<EvaluationResult> {\n const {\n question,\n screenshots,\n agentReasoning,\n systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.\n ${agentReasoning ? \"You also have access to the agent's detailed reasoning and thought process throughout the task.\" : \"\"}\n Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.\n Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).\n ${agentReasoning ? \"The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. 
Use this alongside the visual evidence to make a comprehensive evaluation.\" : \"\"}\n Today's date is ${new Date().toLocaleDateString()}`,\n } = options;\n\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!screenshots \|\| screenshots.length === 0)\n throw new StagehandInvalidArgumentError(\n \"At least one screenshot must be provided\",\n );\n\n const llmClient = this.getClient();\n\n const imageContents = screenshots.map((s) => ({\n type: \"image_url\" as const,\n image_url: { url: `data:image/jpeg;base64,${s.toString(\"base64\")}` },\n }));\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions throughout the task:\\n${agentReasoning}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`\n : `${question}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,\n },\n ...imageContents,\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n}\n"]}
1	+ {"version":3,"file":"v3LegacyEvaluator.js","sourceRoot":"","sources":["../../../lib/v3LegacyEvaluator.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAEH,6BAAwB;AAWxB,4DAAsD;AACtD,iEAA+E;AAE/E,MAAM,gBAAgB,GAAG,OAAC,CAAC,MAAM,CAAC;IAChC,UAAU,EAAE,OAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACjC,SAAS,EAAE,OAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAEH,MAAM,qBAAqB,GAAG,OAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAExD,MAAa,iBAAiB;IACpB,EAAE,CAAK;IACP,SAAS,CAAiB;IAC1B,kBAAkB,CAAqC;IACvD,YAAY,GAA+B,GAAG,EAAE,GAAE,CAAC,CAAC;IAE5D,YACE,EAAM,EACN,SAA0B,EAC1B,kBAAkC;QAElC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,GAAG,SAAS,IAAK,yBAA4C,CAAC;QAC5E,IAAI,CAAC,kBAAkB,GAAG,kBAAkB,IAAI;YAC9C,MAAM,EACJ,OAAO,CAAC,GAAG,CAAC,cAAc;gBAC1B,OAAO,CAAC,GAAG,CAAC,4BAA4B;gBACxC,EAAE;SACL,CAAC;IACJ,CAAC;IAEO,SAAS;QACf,sEAAsE;QACtE,MAAM,QAAQ,GAAG,IAAI,4BAAW,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,UAAU,GAAG,IAAI,EACjB,YAAY,EACZ,iBAAiB,GAAG,GAAG,EACvB,cAAc,GACf,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,MAAM,IAAI,CAAC,UAAU;YACxB,MAAM,IAAI,4CAA6B,CACrC,qDAAqD,CACtD,CAAC;QAEJ,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,gCAAgC,CAAC;gBAC3C,QAAQ;gBACR,MAAM;gBACN,WAAW,EAAE,UAAU;gBACvB,YAAY;gBACZ,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,mBAAmB,GAAG,kIAAkI,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,sDAAsD,8HAA8H,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAElZ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,
EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,IAAI,mBAAmB,EAAE;oBAChE;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,6CAA6C,cAAc,EAAE;oCACpF,CAAC,CAAC,QAAQ;6BACb;4BACD,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,MAAM,EACJ,SAAS,EACT,UAAU,GAAG,IAAI,EACjB,YAAY,GAAG,8EAA8E,EAC7F,iBAAiB,GAAG,GAAG,GACxB,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,SAAS,EAAE,MAAM;YACpB,MAAM,IAAI,4CAA6B,CACrC,iCAAiC,CAClC,CAAC;QAEJ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,SAAS,GAAG,SAAS;aACxB,GAAG,CACF,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CACV,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAClF;aACA,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO
,EAAE,GAAG,YAAY,2CAA2C,UAAU,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,6CAA6C,CAAC,CAAC,CAAC,EAAE,6KAA6K;qBAChX;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE;4BACjC,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE;oBACd,IAAI,EAAE,uBAAuB;oBAC7B,MAAM,EAAE,qBAAqB;iBAC9B;aACF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,QAAQ,CAAC,IAExB,CAAC;YACF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzB,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS;aACvB,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1B,UAAU,EAAE,SAAkB;gBAC9B,SAAS,EAAE,sCAAsC,YAAY,EAAE;aAChE,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,gCAAgC,CAAC,OAM9C;QACC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,WAAW,EACX,cAAc,EACd,YAAY,GAAG;UACX,cAAc,CAAC,CAAC,CAAC,iGAAiG,CAAC,CAAC,CAAC,EAAE;;;UAGvH,cAAc,CAAC,CAAC,CAAC,kNAAkN,CAAC,CAAC,CAAC,EAAE;0BACxN,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,GACtD,GAAG,OAAO,CAAC;QAEZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAC1C,MAAM,IAAI,4CAA6B,CACrC,0CAA0C,CAC3C,CAAC;QAEJ,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5C,IAAI,EAAE,WAAoB;YAC1B,SAAS,EAAE,EAAE,GAAG,EAAE,0BAA0B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,EAAE;SACrE,CAAC,CAAC,CAAC;QAEJ,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;oBACzC;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,E
AAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,2DAA2D,cAAc,qBAAqB,WAAW,CAAC,MAAM,sKAAsK;oCAC7S,CAAC,CAAC,GAAG,QAAQ,qBAAqB,WAAW,CAAC,MAAM,mIAAmI;6BAC1L;4BACD,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,aAAa;yBACjB;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;CACF;AAlRD,8CAkRC","sourcesContent":["/*\n Legacy V3 evaluator implementation.\n \n This is the behavior-preserving implementation that backs V3Evaluator when\n * STAGEHAND_EVALUATOR_BACKEND=legacy.\n */\n\nimport { z } from \"zod\";\nimport type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { LLMParsedResponse } from \"./inference.js\";\nimport { LLMResponse, LLMClient } from \"./v3/llm/LLMClient.js\";\nimport { LogLine } from \"./v3/types/public/logs.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { LLMProvider } from \"./v3/llm/LLMProvider.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\n\nconst EvaluationSchema = z.object({\n evaluation: z.enum([\"YES\", \"NO\"]),\n reasoning: z.string(),\n});\n\nconst BatchEvaluationSchema = z.array(EvaluationSchema);\n\nexport class LegacyV3Evaluator {\n private v3: V3;\n private modelName: AvailableModel;\n private modelClientOptions: ClientOptions \| { apiKey: string };\n private silentLogger: (message: LogLine) => void = () => {};\n\n constructor(\n v3: V3,\n modelName?: 
AvailableModel,\n modelClientOptions?: ClientOptions,\n ) {\n this.v3 = v3;\n this.modelName = modelName \|\| (\"google/gemini-2.5-flash\" as AvailableModel);\n this.modelClientOptions = modelClientOptions \|\| {\n apiKey:\n process.env.GEMINI_API_KEY \|\|\n process.env.GOOGLE_GENERATIVE_AI_API_KEY \|\|\n \"\",\n };\n }\n\n private getClient(): LLMClient {\n // Prefer a dedicated provider so we can override model per-evaluation\n const provider = new LLMProvider(this.v3.logger);\n return provider.getClient(this.modelName, this.modelClientOptions);\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshot = true,\n systemPrompt,\n screenshotDelayMs = 250,\n agentReasoning,\n } = options;\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!answer && !screenshot)\n throw new StagehandInvalidArgumentError(\n \"Either answer (text) or screenshot must be provided\",\n );\n\n if (Array.isArray(screenshot)) {\n return this._evaluateWithMultipleScreenshots({\n question,\n answer,\n screenshots: screenshot,\n systemPrompt,\n agentReasoning,\n });\n }\n\n const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? \"a screenshot\" : \"the agents reasoning and actions throughout the task\"} that you can use to evaluate the tasks completion. 
Provide detailed reasoning for your answer.\\n Today's date is ${new Date().toLocaleDateString()}`;\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer \| undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt \|\| defaultSystemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions taken:\\n${agentReasoning}`\n : question,\n },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? 
error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n const {\n questions,\n screenshot = true,\n systemPrompt = \"You are an expert evaluator that returns YES or NO with a concise reasoning.\",\n screenshotDelayMs = 250,\n } = options;\n if (!questions?.length)\n throw new StagehandInvalidArgumentError(\n \"Questions array cannot be empty\",\n );\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer \| undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const formatted = questions\n .map(\n (item, i) =>\n `${i + 1}. ${item.question}${item.answer ? `\\n Answer: ${item.answer}` : \"\"}`,\n )\n .join(\"\\n\\n\");\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n {\n role: \"system\",\n content: `${systemPrompt}\\n\\nYou will be given multiple questions${screenshot ? \" with a screenshot\" : \"\"}. ${questions.some((q) => q.answer) ? \"Some questions include answers to evaluate.\" : \"\"} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,\n },\n {\n role: \"user\",\n content: [\n { type: \"text\", text: formatted },\n ...(screenshot && imageBuffer\n ? 
[\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ],\n },\n ],\n response_model: {\n name: \"BatchEvaluationResult\",\n schema: BatchEvaluationSchema,\n },\n },\n });\n\n try {\n const results = response.data as unknown as z.infer<\n typeof BatchEvaluationSchema\n >;\n return results.map((r) => ({\n evaluation: r.evaluation,\n reasoning: r.reasoning,\n }));\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return questions.map(() => ({\n evaluation: \"INVALID\" as const,\n reasoning: `Failed to get structured response: ${errorMessage}`,\n }));\n }\n }\n\n private async _evaluateWithMultipleScreenshots(options: {\n question: string;\n answer?: string;\n screenshots: Buffer[];\n systemPrompt?: string;\n agentReasoning?: string;\n }): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshots,\n agentReasoning,\n systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.\n ${agentReasoning ? \"You also have access to the agent's detailed reasoning and thought process throughout the task.\" : \"\"}\n Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.\n Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).\n ${agentReasoning ? \"The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. 
Use this alongside the visual evidence to make a comprehensive evaluation.\" : \"\"}\n Today's date is ${new Date().toLocaleDateString()}`,\n } = options;\n\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!screenshots \|\| screenshots.length === 0)\n throw new StagehandInvalidArgumentError(\n \"At least one screenshot must be provided\",\n );\n\n const llmClient = this.getClient();\n\n const imageContents = screenshots.map((s) => ({\n type: \"image_url\" as const,\n image_url: { url: `data:image/jpeg;base64,${s.toString(\"base64\")}` },\n }));\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions throughout the task:\\n${agentReasoning}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`\n : `${question}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,\n },\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ...imageContents,\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? 
error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n}\n"]}

package/dist/esm/lib/v3/agent/utils/captureAriaTreeProbe.d.ts ADDED Viewed

@@ -0,0 +1,35 @@
+/**
+ * captureAriaTreeProbe — capture a truncated accessibility tree of the active
+ * page for use as tier-2 evidence in the trajectory recorder.
+ *
+ * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the
+ * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay
+ * the cost.
+ *
+ * The a11y tree is the same payload the agent's `ariaTree` tool sees, but
+ * captured by the harness (not the agent) so the verifier has independent
+ * textual ground truth for grounding non-visual claims — prices, names,
+ * dates, list contents — without OCR'ing screenshots.
+ *
+ * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures
+ * across a ~30-step trajectory at that cap sum to ~240k tokens total,
+ * which the verifier handles via per-criterion top-K selection. The cap
+ * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can
+ * trade RAM/disk for fidelity. Truncated content is marked explicitly so
+ * the verifier knows it was clipped.
+ */
+import type { V3 } from "../../v3.js";
+interface CaptureAriaTreeOptions {
+    /**
+     * Soft cap on token count (chars/4 approximation). Default 8000.
+     * When omitted, the VERIFIER_ARIATREE_TOKEN_BUDGET environment variable is
+     * used instead if it parses to a positive integer.
+     */
+    tokenBudget?: number;
+    /**
+     * Hard timeout on the capture, in milliseconds. Default 5s (5000).
+     * Forwarded as the `timeout` option of the underlying `v3.extract()` call.
+     */
+    timeoutMs?: number;
+}
+/**
+ * Returns the truncated a11y tree as a plain string, or undefined when
+ * capture fails. Never throws — a11y capture is best-effort tier-2 evidence,
+ * not a hard requirement, so failures are silently absorbed (the verifier
+ * surfaces this via evidence_insufficient).
+ *
+ * @param v3 - Instance whose `extract()` is called (with only a timeout) to
+ *   obtain the page text.
+ * @param opts - Optional token budget and timeout overrides.
+ * @returns The page text, with a "[CONTENT TRUNCATED ...]" marker appended
+ *   when it exceeded the character budget; undefined when extraction throws
+ *   or does not yield a non-empty string.
+ */
+export declare function captureAriaTreeProbe(v3: V3, opts?: CaptureAriaTreeOptions): Promise<string | undefined>;
+export {};

package/dist/esm/lib/v3/agent/utils/captureAriaTreeProbe.js ADDED Viewed

@@ -0,0 +1,35 @@
+const APPROX_CHARS_PER_TOKEN = 4;
+const DEFAULT_TOKEN_BUDGET = 8_000;
+const DEFAULT_TIMEOUT_MS = 5_000;
+/**
+ * Returns the truncated a11y tree as a plain string, or undefined when
+ * capture fails. Never throws — a11y capture is best-effort tier-2 evidence,
+ * not a hard requirement, so failures are silently absorbed (the verifier
+ * surfaces this via evidence_insufficient).
+ */
+export async function captureAriaTreeProbe(v3, opts = {}) {
+    const envBudget = parseInt(process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? "", 10);
+    const tokenBudget = opts.tokenBudget ??
+        (Number.isFinite(envBudget) && envBudget > 0
+            ? envBudget
+            : DEFAULT_TOKEN_BUDGET);
+    const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+    const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN;
+    try {
+        // v3.extract() without a schema returns { pageText } where pageText is the
+        // rendered accessibility tree — same path the agent's ariaTree tool uses.
+        const result = await v3.extract({ timeout: timeoutMs });
+        const pageText = result?.pageText;
+        if (typeof pageText !== "string" || pageText.length === 0)
+            return undefined;
+        if (pageText.length > maxChars) {
+            return (pageText.slice(0, maxChars) +
+                `\n\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`);
+        }
+        return pageText;
+    }
+    catch {
+        return undefined;
+    }
+}
+//# sourceMappingURL=captureAriaTreeProbe.js.map

package/dist/esm/lib/v3/agent/utils/captureAriaTreeProbe.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"captureAriaTreeProbe.js","sourceRoot":"","sources":["../../../../../../lib/v3/agent/utils/captureAriaTreeProbe.ts"],"names":[],"mappings":"AAsBA,MAAM,sBAAsB,GAAG,CAAC,CAAC;AACjC,MAAM,oBAAoB,GAAG,KAAK,CAAC;AACnC,MAAM,kBAAkB,GAAG,KAAK,CAAC;AASjC;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,EAAM,EACN,OAA+B,EAAE;IAEjC,MAAM,SAAS,GAAG,QAAQ,CACxB,OAAO,CAAC,GAAG,CAAC,8BAA8B,IAAI,EAAE,EAChD,EAAE,CACH,CAAC;IACF,MAAM,WAAW,GACf,IAAI,CAAC,WAAW;QAChB,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC,IAAI,SAAS,GAAG,CAAC;YAC1C,CAAC,CAAC,SAAS;YACX,CAAC,CAAC,oBAAoB,CAAC,CAAC;IAC5B,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,kBAAkB,CAAC;IACvD,MAAM,QAAQ,GAAG,WAAW,GAAG,sBAAsB,CAAC;IAEtD,IAAI,CAAC;QACH,2EAA2E;QAC3E,0EAA0E;QAC1E,MAAM,MAAM,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,CAAC;QACxD,MAAM,QAAQ,GAAG,MAAM,EAAE,QAAQ,CAAC;QAClC,IAAI,OAAO,QAAQ,KAAK,QAAQ,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,SAAS,CAAC;QAE5E,IAAI,QAAQ,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC/B,OAAO,CACL,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC;gBAC3B,8BAA8B,WAAW,wDAAwD,CAClG,CAAC;QACJ,CAAC;QACD,OAAO,QAAQ,CAAC;IAClB,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,SAAS,CAAC;IACnB,CAAC;AACH,CAAC","sourcesContent":["/**\n * captureAriaTreeProbe — capture a truncated accessibility tree of the active\n * page for use as tier-2 evidence in the trajectory recorder.\n *\n * Shared by v3AgentHandler and v3CuaAgentHandler. Listener-gated by the\n * callers so ordinary agent runs (no TrajectoryRecorder attached) don't pay\n * the cost.\n *\n * The a11y tree is the same payload the agent's `ariaTree` tool sees, but\n * captured by the harness (not the agent) so the verifier has independent\n * textual ground truth for grounding non-visual claims — prices, names,\n * dates, list contents — without OCR'ing screenshots.\n *\n * Budget: defaults to ~8000 tokens (32k chars). Per-step a11y captures\n * across a ~30-step trajectory at that cap sum to ~240k tokens total,\n * which the verifier handles via per-criterion top-K selection. 
The cap\n * is configurable via VERIFIER_ARIATREE_TOKEN_BUDGET so consumers can\n * trade RAM/disk for fidelity. Truncated content is marked explicitly so\n * the verifier knows it was clipped.\n */\nimport type { V3 } from \"../../v3.js\";\n\nconst APPROX_CHARS_PER_TOKEN = 4;\nconst DEFAULT_TOKEN_BUDGET = 8_000;\nconst DEFAULT_TIMEOUT_MS = 5_000;\n\ninterface CaptureAriaTreeOptions {\n /** Soft cap on token count (chars/4 approximation). Default 8000. */\n tokenBudget?: number;\n /** Hard timeout on the capture. Default 5s. */\n timeoutMs?: number;\n}\n\n/**\n * Returns the truncated a11y tree as a plain string, or undefined when\n * capture fails. Never throws — a11y capture is best-effort tier-2 evidence,\n * not a hard requirement, so failures are silently absorbed (the verifier\n * surfaces this via evidence_insufficient).\n */\nexport async function captureAriaTreeProbe(\n v3: V3,\n opts: CaptureAriaTreeOptions = {},\n): Promise<string | undefined> {\n const envBudget = parseInt(\n process.env.VERIFIER_ARIATREE_TOKEN_BUDGET ?? \"\",\n 10,\n );\n const tokenBudget =\n opts.tokenBudget ??\n (Number.isFinite(envBudget) && envBudget > 0\n ? envBudget\n : DEFAULT_TOKEN_BUDGET);\n const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;\n const maxChars = tokenBudget * APPROX_CHARS_PER_TOKEN;\n\n try {\n // v3.extract() without a schema returns { pageText } where pageText is the\n // rendered accessibility tree — same path the agent's ariaTree tool uses.\n const result = await v3.extract({ timeout: timeoutMs });\n const pageText = result?.pageText;\n if (typeof pageText !== \"string\" || pageText.length === 0) return undefined;\n\n if (pageText.length > maxChars) {\n return (\n pageText.slice(0, maxChars) +\n `\\n\\n[CONTENT TRUNCATED at ~${tokenBudget} tokens — set VERIFIER_ARIATREE_TOKEN_BUDGET to raise]`\n );\n }\n return pageText;\n } catch {\n return undefined;\n }\n}\n"]}

package/dist/esm/lib/v3/agent/utils/postStepProbeEvidence.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js";
+import type { LogLine } from "../../types/public/logs.js";
+import type { V3 } from "../../v3.js";
+/** Inputs for capturing one probe (URL, screenshot, a11y tree) of the active page. */
+interface CaptureProbeEvidenceOptions {
+    /** Instance whose `context.awaitActivePage()` supplies the page to probe; also handed to the a11y capture. */
+    v3: V3;
+    /** Fallback URL reported when the active page cannot be resolved. */
+    url: string;
+    /** Receives a level-1 "agent" log line when resolving the page or taking the screenshot throws. */
+    logger: (message: LogLine) => void;
+    /** Prefix of that log line; the underlying error message is appended after ": ". */
+    warningMessage: string;
+}
+/** Probe inputs plus the sink that receives the resulting evidence events. */
+interface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions {
+    /** Evidence sink; when omitted, emitPostStepProbeEvidence returns without capturing anything. */
+    evidenceCallback?: AgentEvidenceCallback;
+}
+/**
+ * Captures a viewport screenshot and a11y tree of the active page.
+ *
+ * A failure while resolving the page or taking the screenshot is logged via
+ * `logger` rather than thrown; the a11y capture is still attempted afterwards.
+ *
+ * @returns `url` (the active page's URL, or the supplied fallback), plus
+ *   `screenshot` and `ariaTree` only when each was actually captured.
+ */
+export declare function captureProbeEvidence({ v3, url, logger, warningMessage, }: CaptureProbeEvidenceOptions): Promise<{
+    url: string;
+    screenshot?: Buffer;
+    ariaTree?: string;
+}>;
+/**
+ * Captures probe evidence and forwards it to `evidenceCallback`: a
+ * "screenshot" event (evidenceRole "probe") when a screenshot was taken,
+ * followed by a "step_observed" event carrying the URL and a11y tree.
+ * Does nothing when no callback is supplied.
+ */
+export declare function emitPostStepProbeEvidence({ v3, url, evidenceCallback, logger, warningMessage, }: EmitPostStepProbeEvidenceOptions): Promise<void>;
+export {};

package/dist/esm/lib/v3/agent/utils/postStepProbeEvidence.js ADDED Viewed

@@ -0,0 +1,50 @@
+import { captureAriaTreeProbe } from "./captureAriaTreeProbe.js";
+function errorMessage(error) {
+    return error instanceof Error ? error.message : String(error);
+}
+export async function captureProbeEvidence({ v3, url, logger, warningMessage, }) {
+    let probeUrl = url;
+    let screenshot;
+    try {
+        const page = await v3.context.awaitActivePage();
+        probeUrl = page.url();
+        screenshot = await page.screenshot({ fullPage: false });
+    }
+    catch (e) {
+        logger({
+            category: "agent",
+            message: `${warningMessage}: ${errorMessage(e)}`,
+            level: 1,
+        });
+    }
+    const ariaTree = await captureAriaTreeProbe(v3);
+    return {
+        url: probeUrl,
+        ...(screenshot ? { screenshot } : {}),
+        ...(ariaTree !== undefined ? { ariaTree } : {}),
+    };
+}
+export async function emitPostStepProbeEvidence({ v3, url, evidenceCallback, logger, warningMessage, }) {
+    if (!evidenceCallback)
+        return;
+    const probe = await captureProbeEvidence({
+        v3,
+        url,
+        logger,
+        warningMessage,
+    });
+    if (probe.screenshot) {
+        await evidenceCallback({
+            type: "screenshot",
+            screenshot: probe.screenshot,
+            url: probe.url,
+            evidenceRole: "probe",
+        });
+    }
+    await evidenceCallback({
+        type: "step_observed",
+        url: probe.url,
+        ariaTree: probe.ariaTree,
+    });
+}
+//# sourceMappingURL=postStepProbeEvidence.js.map

package/dist/esm/lib/v3/agent/utils/postStepProbeEvidence.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"postStepProbeEvidence.js","sourceRoot":"","sources":["../../../../../../lib/v3/agent/utils/postStepProbeEvidence.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,oBAAoB,EAAE,MAAM,2BAA2B,CAAC;AAajE,SAAS,YAAY,CAAC,KAAc;IAClC,OAAO,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;AAChE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,EACzC,EAAE,EACF,GAAG,EACH,MAAM,EACN,cAAc,GACc;IAK5B,IAAI,QAAQ,GAAG,GAAG,CAAC;IACnB,IAAI,UAA8B,CAAC;IACnC,IAAI,CAAC;QACH,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;QAChD,QAAQ,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACtB,UAAU,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;IAC1D,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,CAAC;YACL,QAAQ,EAAE,OAAO;YACjB,OAAO,EAAE,GAAG,cAAc,KAAK,YAAY,CAAC,CAAC,CAAC,EAAE;YAChD,KAAK,EAAE,CAAC;SACT,CAAC,CAAC;IACL,CAAC;IAED,MAAM,QAAQ,GAAG,MAAM,oBAAoB,CAAC,EAAE,CAAC,CAAC;IAChD,OAAO;QACL,GAAG,EAAE,QAAQ;QACb,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,EAAE,UAAU,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACrC,GAAG,CAAC,QAAQ,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAChD,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAAC,EAC9C,EAAE,EACF,GAAG,EACH,gBAAgB,EAChB,MAAM,EACN,cAAc,GACmB;IACjC,IAAI,CAAC,gBAAgB;QAAE,OAAO;IAE9B,MAAM,KAAK,GAAG,MAAM,oBAAoB,CAAC;QACvC,EAAE;QACF,GAAG;QACH,MAAM;QACN,cAAc;KACf,CAAC,CAAC;IACH,IAAI,KAAK,CAAC,UAAU,EAAE,CAAC;QACrB,MAAM,gBAAgB,CAAC;YACrB,IAAI,EAAE,YAAY;YAClB,UAAU,EAAE,KAAK,CAAC,UAAU;YAC5B,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,YAAY,EAAE,OAAO;SACtB,CAAC,CAAC;IACL,CAAC;IACD,MAAM,gBAAgB,CAAC;QACrB,IAAI,EAAE,eAAe;QACrB,GAAG,EAAE,KAAK,CAAC,GAAG;QACd,QAAQ,EAAE,KAAK,CAAC,QAAQ;KACzB,CAAC,CAAC;AACL,CAAC","sourcesContent":["import type { AgentEvidenceCallback } from \"../../types/public/agentEvidenceEvents.js\";\nimport type { LogLine } from \"../../types/public/logs.js\";\nimport type { V3 } from \"../../v3.js\";\nimport { captureAriaTreeProbe } from \"./captureAriaTreeProbe.js\";\n\ninterface CaptureProbeEvidenceOptions {\n v3: V3;\n url: 
string;\n logger: (message: LogLine) => void;\n warningMessage: string;\n}\n\ninterface EmitPostStepProbeEvidenceOptions extends CaptureProbeEvidenceOptions {\n evidenceCallback?: AgentEvidenceCallback;\n}\n\nfunction errorMessage(error: unknown): string {\n return error instanceof Error ? error.message : String(error);\n}\n\nexport async function captureProbeEvidence({\n v3,\n url,\n logger,\n warningMessage,\n}: CaptureProbeEvidenceOptions): Promise<{\n url: string;\n screenshot?: Buffer;\n ariaTree?: string;\n}> {\n let probeUrl = url;\n let screenshot: Buffer | undefined;\n try {\n const page = await v3.context.awaitActivePage();\n probeUrl = page.url();\n screenshot = await page.screenshot({ fullPage: false });\n } catch (e) {\n logger({\n category: \"agent\",\n message: `${warningMessage}: ${errorMessage(e)}`,\n level: 1,\n });\n }\n\n const ariaTree = await captureAriaTreeProbe(v3);\n return {\n url: probeUrl,\n ...(screenshot ? { screenshot } : {}),\n ...(ariaTree !== undefined ? { ariaTree } : {}),\n };\n}\n\nexport async function emitPostStepProbeEvidence({\n v3,\n url,\n evidenceCallback,\n logger,\n warningMessage,\n}: EmitPostStepProbeEvidenceOptions): Promise<void> {\n if (!evidenceCallback) return;\n\n const probe = await captureProbeEvidence({\n v3,\n url,\n logger,\n warningMessage,\n });\n if (probe.screenshot) {\n await evidenceCallback({\n type: \"screenshot\",\n screenshot: probe.screenshot,\n url: probe.url,\n evidenceRole: \"probe\",\n });\n }\n await evidenceCallback({\n type: \"step_observed\",\n url: probe.url,\n ariaTree: probe.ariaTree,\n });\n}\n"]}

package/dist/esm/lib/v3/agent/utils/toolOutputEvidence.d.ts ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ import type { AgentStepFinishedEvent } from "../../types/public/agentEvidenceEvents.js";
2	+ export declare function inferToolOutput(toolResult: unknown): AgentStepFinishedEvent["toolOutput"]; // Wraps a raw tool result as { ok, result, error }; ok is false when the result (or its `output`) carries an error, `success: false`, or a truthy `isError`.

package/dist/esm/lib/v3/agent/utils/toolOutputEvidence.js ADDED Viewed

@@ -0,0 +1,59 @@
+const ERROR_STRING_LIMIT = 1000; // Max characters kept from a serialized (non-primitive) error value before "... [truncated]" is appended.
+function isRecord(value) {
+    return value !== null && typeof value === "object" && !Array.isArray(value);
+}
+function hasOwn(value, key) {
+    return Object.prototype.hasOwnProperty.call(value, key);
+}
+function normalizeError(value) {
+    if (value === undefined || value === null || value === false) {
+        return undefined;
+    }
+    if (value instanceof Error) {
+        return value.message;
+    }
+    if (typeof value === "string") {
+        return value;
+    }
+    if (typeof value === "number" ||
+        typeof value === "boolean" ||
+        typeof value === "bigint") {
+        return String(value);
+    }
+    let serialized;
+    try {
+        serialized = JSON.stringify(value) ?? String(value);
+    }
+    catch {
+        serialized = String(value);
+    }
+    if (serialized.length <= ERROR_STRING_LIMIT) {
+        return serialized;
+    }
+    return `${serialized.slice(0, ERROR_STRING_LIMIT)}... [truncated]`;
+}
+function statusCandidates(toolResult) {
+    if (!isRecord(toolResult)) {
+        return [];
+    }
+    const candidates = [toolResult];
+    const output = toolResult.output;
+    if (isRecord(output)) {
+        candidates.push(output);
+    }
+    return candidates;
+}
+export function inferToolOutput(toolResult) {
+    const candidates = statusCandidates(toolResult);
+    const error = candidates
+        .map((candidate) => hasOwn(candidate, "error") ? normalizeError(candidate.error) : undefined)
+        .find((message) => message !== undefined);
+    const successFalse = candidates.some((candidate) => candidate.success === false);
+    const isError = candidates.some((candidate) => Boolean(candidate.isError));
+    return {
+        ok: error === undefined && !isError && !successFalse,
+        result: toolResult,
+        error,
+    };
+}
+//# sourceMappingURL=toolOutputEvidence.js.map

package/dist/esm/lib/v3/agent/utils/toolOutputEvidence.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"toolOutputEvidence.js","sourceRoot":"","sources":["../../../../../../lib/v3/agent/utils/toolOutputEvidence.ts"],"names":[],"mappings":"AAEA,MAAM,kBAAkB,GAAG,IAAI,CAAC;AAEhC,SAAS,QAAQ,CAAC,KAAc;IAC9B,OAAO,KAAK,KAAK,IAAI,IAAI,OAAO,KAAK,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;AAC9E,CAAC;AAED,SAAS,MAAM,CAAC,KAA8B,EAAE,GAAW;IACzD,OAAO,MAAM,CAAC,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;AAC1D,CAAC;AAED,SAAS,cAAc,CAAC,KAAc;IACpC,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;QAC7D,OAAO,SAAS,CAAC;IACnB,CAAC;IACD,IAAI,KAAK,YAAY,KAAK,EAAE,CAAC;QAC3B,OAAO,KAAK,CAAC,OAAO,CAAC;IACvB,CAAC;IACD,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IACD,IACE,OAAO,KAAK,KAAK,QAAQ;QACzB,OAAO,KAAK,KAAK,SAAS;QAC1B,OAAO,KAAK,KAAK,QAAQ,EACzB,CAAC;QACD,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC;IACvB,CAAC;IAED,IAAI,UAAkB,CAAC;IACvB,IAAI,CAAC;QACH,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;IACtD,CAAC;IAAC,MAAM,CAAC;QACP,UAAU,GAAG,MAAM,CAAC,KAAK,CAAC,CAAC;IAC7B,CAAC;IACD,IAAI,UAAU,CAAC,MAAM,IAAI,kBAAkB,EAAE,CAAC;QAC5C,OAAO,UAAU,CAAC;IACpB,CAAC;IACD,OAAO,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,kBAAkB,CAAC,iBAAiB,CAAC;AACrE,CAAC;AAED,SAAS,gBAAgB,CAAC,UAAmB;IAC3C,IAAI,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;QAC1B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,UAAU,GAAG,CAAC,UAAU,CAAC,CAAC;IAChC,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC;IACjC,IAAI,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QACrB,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC1B,CAAC;IACD,OAAO,UAAU,CAAC;AACpB,CAAC;AAED,MAAM,UAAU,eAAe,CAC7B,UAAmB;IAEnB,MAAM,UAAU,GAAG,gBAAgB,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,KAAK,GAAG,UAAU;SACrB,GAAG,CAAC,CAAC,SAAS,EAAE,EAAE,CACjB,MAAM,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS,CACzE;SACA,IAAI,CAAC,CAAC,OAAO,EAAqB,EAAE,CAAC,OAAO,KAAK,SAAS,CAAC,CAAC;IAE/D,MAAM,YAAY,GAAG,UAAU,CAAC,IAAI,CAClC,CAAC,SAAS,EAAE,EAAE,CAAC,SAAS,CAAC,OAAO,KAAK,KAAK,CAC3C,CAAC;IACF,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,SAAS,EAAE,EAA
E,CAAC,OAAO,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;IAE3E,OAAO;QACL,EAAE,EAAE,KAAK,KAAK,SAAS,IAAI,CAAC,OAAO,IAAI,CAAC,YAAY;QACpD,MAAM,EAAE,UAAU;QAClB,KAAK;KACN,CAAC;AACJ,CAAC","sourcesContent":["import type { AgentStepFinishedEvent } from \"../../types/public/agentEvidenceEvents.js\";\n\nconst ERROR_STRING_LIMIT = 1000;\n\nfunction isRecord(value: unknown): value is Record<string, unknown> {\n return value !== null && typeof value === \"object\" && !Array.isArray(value);\n}\n\nfunction hasOwn(value: Record<string, unknown>, key: string): boolean {\n return Object.prototype.hasOwnProperty.call(value, key);\n}\n\nfunction normalizeError(value: unknown): string | undefined {\n if (value === undefined || value === null || value === false) {\n return undefined;\n }\n if (value instanceof Error) {\n return value.message;\n }\n if (typeof value === \"string\") {\n return value;\n }\n if (\n typeof value === \"number\" ||\n typeof value === \"boolean\" ||\n typeof value === \"bigint\"\n ) {\n return String(value);\n }\n\n let serialized: string;\n try {\n serialized = JSON.stringify(value) ?? String(value);\n } catch {\n serialized = String(value);\n }\n if (serialized.length <= ERROR_STRING_LIMIT) {\n return serialized;\n }\n return `${serialized.slice(0, ERROR_STRING_LIMIT)}... [truncated]`;\n}\n\nfunction statusCandidates(toolResult: unknown): Record<string, unknown>[] {\n if (!isRecord(toolResult)) {\n return [];\n }\n\n const candidates = [toolResult];\n const output = toolResult.output;\n if (isRecord(output)) {\n candidates.push(output);\n }\n return candidates;\n}\n\nexport function inferToolOutput(\n toolResult: unknown,\n): AgentStepFinishedEvent[\"toolOutput\"] {\n const candidates = statusCandidates(toolResult);\n const error = candidates\n .map((candidate) =>\n hasOwn(candidate, \"error\") ? 
normalizeError(candidate.error) : undefined,\n )\n .find((message): message is string => message !== undefined);\n\n const successFalse = candidates.some(\n (candidate) => candidate.success === false,\n );\n const isError = candidates.some((candidate) => Boolean(candidate.isError));\n\n return {\n ok: error === undefined && !isError && !successFalse,\n result: toolResult,\n error,\n };\n}\n"]}

package/dist/esm/lib/v3/agent/utils/wrapEvidenceCallback.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+import type { AgentEvidenceCallback } from "../../types/public/agentEvidenceEvents.js";
+import type { LogLine } from "../../types/public/logs.js";
+/**
+ * Wraps a user-supplied evidence callback so that a throw or rejection is
+ * reported to `logger` as a level-1 "agent" warning instead of propagating.
+ * Returns undefined when no callback is given.
+ */
+export declare function wrapEvidenceCallback(callback: AgentEvidenceCallback | undefined, logger: (message: LogLine) => void): AgentEvidenceCallback | undefined;

package/dist/esm/lib/v3/agent/utils/wrapEvidenceCallback.js ADDED Viewed

@@ -0,0 +1,22 @@
+// onEvidence is a user-supplied observability hook (trajectory recording,
+// verifier capture, etc.). Wrap it once at the boundary where the handler
+// receives it so a throwing user callback can never abort the agent loop —
+// internal emit sites can then call the wrapped callback directly without
+// per-site try/catch.
+export function wrapEvidenceCallback(callback, logger) {
+    if (!callback)
+        return undefined;
+    return async (event) => {
+        try {
+            await callback(event);
+        }
+        catch (e) {
+            logger({
+                category: "agent",
+                message: `Warning: onEvidence callback failed for ${event.type}: ${e instanceof Error ? e.message : String(e)}`,
+                level: 1,
+            });
+        }
+    };
+}
+//# sourceMappingURL=wrapEvidenceCallback.js.map

package/dist/esm/lib/v3/agent/utils/wrapEvidenceCallback.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"wrapEvidenceCallback.js","sourceRoot":"","sources":["../../../../../../lib/v3/agent/utils/wrapEvidenceCallback.ts"],"names":[],"mappings":"AAGA,0EAA0E;AAC1E,0EAA0E;AAC1E,2EAA2E;AAC3E,0EAA0E;AAC1E,sBAAsB;AACtB,MAAM,UAAU,oBAAoB,CAClC,QAA2C,EAC3C,MAAkC;IAElC,IAAI,CAAC,QAAQ;QAAE,OAAO,SAAS,CAAC;IAChC,OAAO,KAAK,EAAE,KAAK,EAAE,EAAE;QACrB,IAAI,CAAC;YACH,MAAM,QAAQ,CAAC,KAAK,CAAC,CAAC;QACxB,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,MAAM,CAAC;gBACL,QAAQ,EAAE,OAAO;gBACjB,OAAO,EAAE,2CAA2C,KAAK,CAAC,IAAI,KAC5D,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAC3C,EAAE;gBACF,KAAK,EAAE,CAAC;aACT,CAAC,CAAC;QACL,CAAC;IACH,CAAC,CAAC;AACJ,CAAC","sourcesContent":["import type { AgentEvidenceCallback } from \"../../types/public/agentEvidenceEvents.js\";\nimport type { LogLine } from \"../../types/public/logs.js\";\n\n// onEvidence is a user-supplied observability hook (trajectory recording,\n// verifier capture, etc.). Wrap it once at the boundary where the handler\n// receives it so a throwing user callback can never abort the agent loop —\n// internal emit sites can then call the wrapped callback directly without\n// per-site try/catch.\nexport function wrapEvidenceCallback(\n callback: AgentEvidenceCallback | undefined,\n logger: (message: LogLine) => void,\n): AgentEvidenceCallback | undefined {\n if (!callback) return undefined;\n return async (event) => {\n try {\n await callback(event);\n } catch (e) {\n logger({\n category: \"agent\",\n message: `Warning: onEvidence callback failed for ${event.type}: ${\n e instanceof Error ? e.message : String(e)\n }`,\n level: 1,\n });\n }\n };\n}\n"]}

package/dist/esm/lib/v3/api.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import type { ActResult, AgentConfig, AgentExecuteOptions, AgentResult, ExtractResult, ObserveResult, LogLine, StagehandMetrics, BrowserbaseRegion, ActOptions, ExtractOptions, ObserveOptions, Api } from "./types/public/index.js";
 import type { SerializableResponse, AgentCacheTransferPayload } from "./types/private/index.js";
+import type { ModelConfiguration } from "./types/public/model.js";
 import type { StagehandZodSchema } from "./zodCompat.js";
 /**
  * Mapping of Browserbase regions to their corresponding Stagehand API base URLs.
@@ -41,6 +42,8 @@ interface ClientSessionStartParams extends Api.SessionStartRequest {
      *  Optional: when omitted, requests are sent without the x-model-api-key header
      *  and the server is expected to handle model authentication on its own. */
     modelApiKey?: string;
+    /** Default model config for later action requests. Not sent to /sessions/start. */
+    defaultModelConfig?: ModelConfiguration;
 }
 /**
  * Client parameters for act() method.
@@ -79,6 +82,7 @@ export declare class StagehandAPIClient {
     private sessionId?;
     private modelApiKey?;
     private modelProvider?;
+    private defaultModelConfig?;
     private region?;
     private logger;
     private fetchWithCookies;
@@ -87,7 +91,7 @@ export declare class StagehandAPIClient {
     private latestAgentCacheEntry;
     private warnedStagehandBaseUrl;
     constructor({ apiKey, projectId, logger, serverCache, }: StagehandAPIConstructorParams);
-    init({ modelName, modelApiKey, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, }: ClientSessionStartParams): Promise<Api.SessionStartResult>;
+    init({ modelName, modelApiKey, defaultModelConfig, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID, }: ClientSessionStartParams): Promise<Api.SessionStartResult>;
     act({ input, options, frameId, }: ClientActParameters): Promise<ActResult>;
     extract<T extends StagehandZodSchema>({ instruction, schema: zodSchema, options, frameId, }: ClientExtractParameters): Promise<ExtractResult<T>>;
     observe({ instruction, options, frameId, }: ClientObserveParameters): Promise<ObserveResult>;
@@ -105,6 +109,8 @@ export declare class StagehandAPIClient {
      * model provider differs from the one used to init the session.
      */
     private prepareModelConfig;
+    private getDefaultModelConfig;
+    private getModelProvider;
     private consumeFinishedEventData;
     private execute;
     /**

package/dist/esm/lib/v3/api.js CHANGED Viewed

@@ -35,6 +35,7 @@ export class StagehandAPIClient {
     sessionId;
     modelApiKey;
     modelProvider;
+    defaultModelConfig;
     region;
     logger;
     fetchWithCookies;
@@ -50,7 +51,7 @@ export class StagehandAPIClient {
         // Create a single cookie jar instance that will persist across all requests
         this.fetchWithCookies = makeFetchCookie(fetch);
     }
-    async init({ modelName, modelApiKey, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID,
+    async init({ modelName, modelApiKey, defaultModelConfig, domSettleTimeoutMs, verbose, systemPrompt, selfHeal, browserbaseSessionCreateParams, browserbaseSessionID,
     // browser,  TODO for local browsers
      }) {
         this.modelApiKey = modelApiKey;
@@ -58,6 +59,9 @@ export class StagehandAPIClient {
         this.modelProvider = modelName?.includes("/")
             ? modelName.split("/")[0]
             : undefined;
+        this.defaultModelConfig = defaultModelConfig
+            ? this.prepareModelConfig(defaultModelConfig)
+            : undefined;
         // Store the region for multi-region API URL resolution
         this.region = browserbaseSessionCreateParams?.region;
         this.logger({
@@ -111,13 +115,21 @@ export class StagehandAPIClient {
             // eslint-disable-next-line @typescript-eslint/no-unused-vars
             const { page: _, serverCache: enableCache, ...restOptions } = options;
             serverCache = enableCache;
+            if (restOptions.model) {
+                restOptions.model = this.prepareModelConfig(restOptions.model);
+            }
+            else if (this.defaultModelConfig) {
+                restOptions.model = this.getDefaultModelConfig();
+            }
             if (Object.keys(restOptions).length > 0) {
-                if (restOptions.model) {
-                    restOptions.model = this.prepareModelConfig(restOptions.model);
-                }
                 wireOptions = restOptions;
             }
         }
+        else if (this.defaultModelConfig) {
+            wireOptions = {
+                model: this.getDefaultModelConfig(),
+            };
+        }
         // Build wire-format request body
         const requestBody = {
             input,
@@ -140,13 +152,21 @@ export class StagehandAPIClient {
             // eslint-disable-next-line @typescript-eslint/no-unused-vars
             const { page: _, serverCache: enableCache, ...restOptions } = options;
             serverCache = enableCache;
+            if (restOptions.model) {
+                restOptions.model = this.prepareModelConfig(restOptions.model);
+            }
+            else if (this.defaultModelConfig) {
+                restOptions.model = this.getDefaultModelConfig();
+            }
             if (Object.keys(restOptions).length > 0) {
-                if (restOptions.model) {
-                    restOptions.model = this.prepareModelConfig(restOptions.model);
-                }
                 wireOptions = restOptions;
             }
         }
+        else if (this.defaultModelConfig) {
+            wireOptions = {
+                model: this.getDefaultModelConfig(),
+            };
+        }
         // Build wire-format request body
         const requestBody = {
             instruction,
@@ -168,13 +188,21 @@ export class StagehandAPIClient {
             // eslint-disable-next-line @typescript-eslint/no-unused-vars
             const { page: _, serverCache: enableCache, ...restOptions } = options;
             serverCache = enableCache;
+            if (restOptions.model) {
+                restOptions.model = this.prepareModelConfig(restOptions.model);
+            }
+            else if (this.defaultModelConfig) {
+                restOptions.model = this.getDefaultModelConfig();
+            }
             if (Object.keys(restOptions).length > 0) {
-                if (restOptions.model) {
-                    restOptions.model = this.prepareModelConfig(restOptions.model);
-                }
                 wireOptions = restOptions;
             }
         }
+        else if (this.defaultModelConfig) {
+            wireOptions = {
+                model: this.getDefaultModelConfig(),
+            };
+        }
         // Build wire-format request body
         const requestBody = {
             instruction,
@@ -188,7 +216,19 @@ export class StagehandAPIClient {
         });
     }
     async goto(url, options, frameId) {
-        const requestBody = { url, options, frameId };
+        const publicOptions = { ...(options ?? {}) };
+        delete publicOptions.model;
+        const wireOptions = {
+            ...publicOptions,
+            ...(this.defaultModelConfig
+                ? { model: this.getDefaultModelConfig() }
+                : {}),
+        };
+        const requestBody = {
+            url,
+            options: Object.keys(wireOptions).length > 0 ? wireOptions : undefined,
+            frameId,
+        };
         return this.execute({
             method: "navigate",
             args: requestBody,
@@ -218,7 +258,7 @@ export class StagehandAPIClient {
             cua: agentConfig.mode === undefined ? agentConfig.cua : undefined,
             model: agentConfig.model
                 ? this.prepareModelConfig(agentConfig.model)
-                : undefined,
+                : this.getDefaultModelConfig(),
             executionModel: agentConfig.executionModel
                 ? this.prepareModelConfig(agentConfig.executionModel)
                 : undefined,
@@ -374,28 +414,41 @@ export class StagehandAPIClient {
     prepareModelConfig(model) { // Normalizes `model` (a model-name string or a config object) into a config object, adding an apiKey when one can be resolved.
         if (typeof model === "string") {
             // Extract provider from model string (e.g., "openai/gpt-5-nano" -> "openai")
-            const provider = model.includes("/") ? model.split("/")[0] : undefined;
+            const provider = this.getModelProvider(model);
+            const inheritedDefault = provider && provider === this.modelProvider // inherit the client default only when the provider matches this.modelProvider
+                ? this.getDefaultModelConfig()
+                : undefined;
             const apiKey = provider && provider !== this.modelProvider // a different provider: try its key via loadApiKeyFromEnv first, else fall back to this.modelApiKey
                 ? (loadApiKeyFromEnv(provider, this.logger) ?? this.modelApiKey)
                 : this.modelApiKey;
             return {
+                ...inheritedDefault, // spread first so modelName/apiKey below take precedence
                 modelName: model,
                 ...(apiKey ? { apiKey } : {}),
             };
         }
-        if (!model.apiKey) {
-            const provider = model.modelName?.includes("/")
-                ? model.modelName.split("/")[0]
+        const provider = this.getModelProvider(model.modelName);
+        const inheritedDefault = provider && provider === this.modelProvider // same inheritance rule as the string branch
+            ? this.getDefaultModelConfig()
+            : undefined;
+        const apiKey = !model.apiKey && provider && provider !== this.modelProvider // a key is resolved only when the caller's config has none
+            ? (loadApiKeyFromEnv(provider, this.logger) ?? this.modelApiKey)
+            : !model.apiKey
+                ? this.modelApiKey
                 : undefined;
-            const apiKey = provider && provider !== this.modelProvider
-                ? (loadApiKeyFromEnv(provider, this.logger) ?? this.modelApiKey)
-                : this.modelApiKey;
-            return {
-                ...model,
-                ...(apiKey ? { apiKey } : {}),
-            };
-        }
-        return model;
+        return { // precedence, lowest to highest: inherited default, caller's config, resolved apiKey
+            ...inheritedDefault,
+            ...model,
+            ...(apiKey ? { apiKey } : {}),
+        };
+    }
+    getDefaultModelConfig() { // Returns a shallow copy of this.defaultModelConfig, or undefined when it is not set.
+        return this.defaultModelConfig
+            ? { ...this.defaultModelConfig } // a fresh object each call, so callers do not share the stored one (nested values are still shared)
+            : undefined;
+    }
+    getModelProvider(modelName) { // Returns the text before the first "/" in modelName, or undefined when modelName is nullish or contains no "/".
+        return modelName?.includes("/") ? modelName.split("/")[0] : undefined;
     }
     consumeFinishedEventData() {
         const data = this.lastFinishedEventData;
@@ -577,12 +630,30 @@ export class StagehandAPIClient {
         else {
             baseUrl = getApiUrlForRegion(this.region);
         }
+        const headers = {
+            ...defaultHeaders,
+            ...options.headers,
+        };
+        if (path.endsWith("/navigate")) {
+            let body = options.body;
+            if (typeof options.body === "string") {
+                try {
+                    body = JSON.parse(options.body);
+                }
+                catch {
+                    body = options.body;
+                }
+            }
+            console.log("Stagehand goto request", JSON.stringify({
+                url: `${baseUrl}${path}`,
+                method: options.method,
+                headers,
+                body,
+            }, null, 2));
+        }
         const response = await this.fetchWithCookies(`${baseUrl}${path}`, {
             ...options,
-            headers: {
-                ...defaultHeaders,
-                ...options.headers,
-            },
+            headers,
         });
         return response;
     }