@browserbasehq/orca 3.4.0-preview-5 → 3.5.0-preview.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/dist/cjs/lib/inference.d.ts +2 -1
  2. package/dist/cjs/lib/inference.js +10 -3
  3. package/dist/cjs/lib/inference.js.map +1 -1
  4. package/dist/cjs/lib/prompt.d.ts +2 -2
  5. package/dist/cjs/lib/prompt.js +19 -4
  6. package/dist/cjs/lib/prompt.js.map +1 -1
  7. package/dist/cjs/lib/v3/handlers/extractHandler.js +19 -2
  8. package/dist/cjs/lib/v3/handlers/extractHandler.js.map +1 -1
  9. package/dist/cjs/lib/v3/index.d.ts +1 -0
  10. package/dist/cjs/lib/v3/index.js.map +1 -1
  11. package/dist/cjs/lib/v3/types/private/handlers.d.ts +1 -0
  12. package/dist/cjs/lib/v3/types/private/handlers.js.map +1 -1
  13. package/dist/cjs/lib/v3/types/public/api.d.ts +2 -0
  14. package/dist/cjs/lib/v3/types/public/api.js +4 -0
  15. package/dist/cjs/lib/v3/types/public/api.js.map +1 -1
  16. package/dist/cjs/lib/v3/types/public/methods.d.ts +1 -0
  17. package/dist/cjs/lib/v3/types/public/methods.js.map +1 -1
  18. package/dist/cjs/lib/v3/v3.js +2 -0
  19. package/dist/cjs/lib/v3/v3.js.map +1 -1
  20. package/dist/cjs/lib/v3Evaluator.d.ts +20 -12
  21. package/dist/cjs/lib/v3Evaluator.js +41 -199
  22. package/dist/cjs/lib/v3Evaluator.js.map +1 -1
  23. package/dist/cjs/lib/v3LegacyEvaluator.d.ts +20 -0
  24. package/dist/cjs/lib/v3LegacyEvaluator.js +215 -0
  25. package/dist/cjs/lib/v3LegacyEvaluator.js.map +1 -0
  26. package/dist/esm/lib/inference.d.ts +2 -1
  27. package/dist/esm/lib/inference.js +10 -3
  28. package/dist/esm/lib/inference.js.map +1 -1
  29. package/dist/esm/lib/prompt.d.ts +2 -2
  30. package/dist/esm/lib/prompt.js +19 -4
  31. package/dist/esm/lib/prompt.js.map +1 -1
  32. package/dist/esm/lib/v3/handlers/extractHandler.js +19 -2
  33. package/dist/esm/lib/v3/handlers/extractHandler.js.map +1 -1
  34. package/dist/esm/lib/v3/index.d.ts +1 -0
  35. package/dist/esm/lib/v3/index.js.map +1 -1
  36. package/dist/esm/lib/v3/types/private/handlers.d.ts +1 -0
  37. package/dist/esm/lib/v3/types/private/handlers.js.map +1 -1
  38. package/dist/esm/lib/v3/types/public/api.d.ts +2 -0
  39. package/dist/esm/lib/v3/types/public/api.js +4 -0
  40. package/dist/esm/lib/v3/types/public/api.js.map +1 -1
  41. package/dist/esm/lib/v3/types/public/methods.d.ts +1 -0
  42. package/dist/esm/lib/v3/types/public/methods.js.map +1 -1
  43. package/dist/esm/lib/v3/v3.js +2 -0
  44. package/dist/esm/lib/v3/v3.js.map +1 -1
  45. package/dist/esm/lib/v3Evaluator.d.ts +20 -12
  46. package/dist/esm/lib/v3Evaluator.js +41 -199
  47. package/dist/esm/lib/v3Evaluator.js.map +1 -1
  48. package/dist/esm/lib/v3LegacyEvaluator.d.ts +20 -0
  49. package/dist/esm/lib/v3LegacyEvaluator.js +211 -0
  50. package/dist/esm/lib/v3LegacyEvaluator.js.map +1 -0
  51. package/package.json +3 -3
  52. package/dist/cjs/lib/v3/dom/build/selectorRuntime.generated.d.ts +0 -24
  53. package/dist/cjs/lib/v3/dom/build/selectorRuntime.generated.js +0 -31
  54. package/dist/cjs/lib/v3/dom/build/selectorRuntime.generated.js.map +0 -1
  55. package/dist/esm/lib/v3/dom/build/selectorRuntime.generated.d.ts +0 -24
  56. package/dist/esm/lib/v3/dom/build/selectorRuntime.generated.js +0 -28
  57. package/dist/esm/lib/v3/dom/build/selectorRuntime.generated.js.map +0 -1
@@ -1,19 +1,27 @@
1
- /**
2
- * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.
3
- * It uses the V3 page/screenshot APIs and constructs an LLM client to run
4
- * structured evaluations (YES/NO with reasoning) on screenshots and/or text.
5
- */
6
1
  import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
7
2
  import type { EvaluateOptions, BatchAskOptions, EvaluationResult } from "./v3/types/private/evaluator.js";
8
3
  import { V3 } from "./v3/v3.js";
4
+ export type V3EvaluatorBackend = "legacy" | "verifier";
5
+ export type V3EvaluatorOptions = {
6
+ /**
7
+ * Selects the evaluator implementation.
8
+ *
9
+ * "legacy" preserves the existing screenshot/text YES/NO evaluator.
10
+ * "verifier" is reserved for the rubric verifier backend.
11
+ *
12
+ * @default process.env.STAGEHAND_EVALUATOR_BACKEND || "legacy"
13
+ */
14
+ backend?: V3EvaluatorBackend;
15
+ };
16
+ export type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {
17
+ modelName?: AvailableModel;
18
+ modelClientOptions?: ClientOptions;
19
+ };
9
20
  export declare class V3Evaluator {
10
- private v3;
11
- private modelName;
12
- private modelClientOptions;
13
- private silentLogger;
14
- constructor(v3: V3, modelName?: AvailableModel, modelClientOptions?: ClientOptions);
15
- private getClient;
21
+ private readonly backend;
22
+ private readonly legacyEvaluator;
23
+ constructor(v3: V3, modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions, modelClientOptions?: ClientOptions, options?: V3EvaluatorOptions);
16
24
  ask(options: EvaluateOptions): Promise<EvaluationResult>;
17
25
  batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]>;
18
- private _evaluateWithMultipleScreenshots;
26
+ private getLegacyBackend;
19
27
  }
@@ -1,210 +1,52 @@
1
- /**
2
- * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.
3
- * It uses the V3 page/screenshot APIs and constructs an LLM client to run
4
- * structured evaluations (YES/NO with reasoning) on screenshots and/or text.
5
- */
6
- import { z } from "zod";
7
- import { LLMProvider } from "./v3/llm/LLMProvider.js";
8
1
  import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
9
- const EvaluationSchema = z.object({
10
- evaluation: z.enum(["YES", "NO"]),
11
- reasoning: z.string(),
12
- });
13
- const BatchEvaluationSchema = z.array(EvaluationSchema);
2
+ import { LegacyV3Evaluator } from "./v3LegacyEvaluator.js";
3
+ const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND";
4
+ const DEFAULT_EVALUATOR_BACKEND = "legacy";
14
5
  export class V3Evaluator {
15
- v3;
16
- modelName;
17
- modelClientOptions;
18
- silentLogger = () => { };
19
- constructor(v3, modelName, modelClientOptions) {
20
- this.v3 = v3;
21
- this.modelName = modelName || "google/gemini-2.5-flash";
22
- this.modelClientOptions = modelClientOptions || {
23
- apiKey: process.env.GEMINI_API_KEY ||
24
- process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
25
- "",
26
- };
27
- }
28
- getClient() {
29
- // Prefer a dedicated provider so we can override model per-evaluation
30
- const provider = new LLMProvider(this.v3.logger);
31
- return provider.getClient(this.modelName, this.modelClientOptions);
6
+ backend;
7
+ legacyEvaluator;
8
+ constructor(v3, modelNameOrOptions, modelClientOptions, options) {
9
+ const normalizedOptions = normalizeConstructorOptions(modelNameOrOptions, modelClientOptions, options);
10
+ this.backend = resolveEvaluatorBackend(normalizedOptions.backend);
11
+ this.legacyEvaluator = new LegacyV3Evaluator(v3, normalizedOptions.modelName, normalizedOptions.modelClientOptions);
32
12
  }
33
13
  async ask(options) {
34
- const { question, answer, screenshot = true, systemPrompt, screenshotDelayMs = 250, agentReasoning, } = options;
35
- if (!question)
36
- throw new StagehandInvalidArgumentError("Question cannot be an empty string");
37
- if (!answer && !screenshot)
38
- throw new StagehandInvalidArgumentError("Either answer (text) or screenshot must be provided");
39
- if (Array.isArray(screenshot)) {
40
- return this._evaluateWithMultipleScreenshots({
41
- question,
42
- screenshots: screenshot,
43
- systemPrompt,
44
- agentReasoning,
45
- });
46
- }
47
- const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n Today's date is ${new Date().toLocaleDateString()}`;
48
- await new Promise((r) => setTimeout(r, screenshotDelayMs));
49
- let imageBuffer;
50
- if (screenshot) {
51
- const page = await this.v3.context.awaitActivePage();
52
- imageBuffer = await page.screenshot({ fullPage: false });
53
- }
54
- const llmClient = this.getClient();
55
- const response = await llmClient.createChatCompletion({
56
- logger: this.silentLogger,
57
- options: {
58
- messages: [
59
- { role: "system", content: systemPrompt || defaultSystemPrompt },
60
- {
61
- role: "user",
62
- content: [
63
- {
64
- type: "text",
65
- text: agentReasoning
66
- ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
67
- : question,
68
- },
69
- ...(screenshot && imageBuffer
70
- ? [
71
- {
72
- type: "image_url",
73
- image_url: {
74
- url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
75
- },
76
- },
77
- ]
78
- : []),
79
- ...(answer
80
- ? [{ type: "text", text: `the answer is ${answer}` }]
81
- : []),
82
- ],
83
- },
84
- ],
85
- response_model: { name: "EvaluationResult", schema: EvaluationSchema },
86
- },
87
- });
88
- try {
89
- const result = response.data;
90
- return { evaluation: result.evaluation, reasoning: result.reasoning };
91
- }
92
- catch (error) {
93
- const errorMessage = error instanceof Error ? error.message : String(error);
94
- return {
95
- evaluation: "INVALID",
96
- reasoning: `Failed to get structured response: ${errorMessage}`,
97
- };
98
- }
14
+ return this.getLegacyBackend("ask").ask(options);
99
15
  }
100
16
  async batchAsk(options) {
101
- const { questions, screenshot = true, systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.", screenshotDelayMs = 250, } = options;
102
- if (!questions?.length)
103
- throw new StagehandInvalidArgumentError("Questions array cannot be empty");
104
- await new Promise((r) => setTimeout(r, screenshotDelayMs));
105
- let imageBuffer;
106
- if (screenshot) {
107
- const page = await this.v3.context.awaitActivePage();
108
- imageBuffer = await page.screenshot({ fullPage: false });
109
- }
110
- const llmClient = this.getClient();
111
- const formatted = questions
112
- .map((item, i) => `${i + 1}. ${item.question}${item.answer ? `\n Answer: ${item.answer}` : ""}`)
113
- .join("\n\n");
114
- const response = await llmClient.createChatCompletion({
115
- logger: this.silentLogger,
116
- options: {
117
- messages: [
118
- {
119
- role: "system",
120
- content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
121
- },
122
- {
123
- role: "user",
124
- content: [
125
- { type: "text", text: formatted },
126
- ...(screenshot && imageBuffer
127
- ? [
128
- {
129
- type: "image_url",
130
- image_url: {
131
- url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
132
- },
133
- },
134
- ]
135
- : []),
136
- ],
137
- },
138
- ],
139
- response_model: {
140
- name: "BatchEvaluationResult",
141
- schema: BatchEvaluationSchema,
142
- },
143
- },
144
- });
145
- try {
146
- const results = response.data;
147
- return results.map((r) => ({
148
- evaluation: r.evaluation,
149
- reasoning: r.reasoning,
150
- }));
151
- }
152
- catch (error) {
153
- const errorMessage = error instanceof Error ? error.message : String(error);
154
- return questions.map(() => ({
155
- evaluation: "INVALID",
156
- reasoning: `Failed to get structured response: ${errorMessage}`,
157
- }));
158
- }
17
+ return this.getLegacyBackend("batchAsk").batchAsk(options);
159
18
  }
160
- async _evaluateWithMultipleScreenshots(options) {
161
- const { question, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
162
- ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
163
- Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
164
- Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
165
- ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
166
- Today's date is ${new Date().toLocaleDateString()}`, } = options;
167
- if (!question)
168
- throw new StagehandInvalidArgumentError("Question cannot be an empty string");
169
- if (!screenshots || screenshots.length === 0)
170
- throw new StagehandInvalidArgumentError("At least one screenshot must be provided");
171
- const llmClient = this.getClient();
172
- const imageContents = screenshots.map((s) => ({
173
- type: "image_url",
174
- image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` },
175
- }));
176
- const response = await llmClient.createChatCompletion({
177
- logger: this.silentLogger,
178
- options: {
179
- messages: [
180
- { role: "system", content: systemPrompt },
181
- {
182
- role: "user",
183
- content: [
184
- {
185
- type: "text",
186
- text: agentReasoning
187
- ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
188
- : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
189
- },
190
- ...imageContents,
191
- ],
192
- },
193
- ],
194
- response_model: { name: "EvaluationResult", schema: EvaluationSchema },
195
- },
196
- });
197
- try {
198
- const result = response.data;
199
- return { evaluation: result.evaluation, reasoning: result.reasoning };
200
- }
201
- catch (error) {
202
- const errorMessage = error instanceof Error ? error.message : String(error);
203
- return {
204
- evaluation: "INVALID",
205
- reasoning: `Failed to get structured response: ${errorMessage}`,
206
- };
19
+ getLegacyBackend(methodName) {
20
+ if (this.backend === "legacy") {
21
+ return this.legacyEvaluator;
207
22
  }
23
+ throw new StagehandInvalidArgumentError(`V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`);
24
+ }
25
+ }
26
+ function normalizeConstructorOptions(modelNameOrOptions, modelClientOptions, options) {
27
+ if (modelNameOrOptions &&
28
+ typeof modelNameOrOptions === "object" &&
29
+ !Array.isArray(modelNameOrOptions)) {
30
+ return {
31
+ modelName: modelNameOrOptions.modelName,
32
+ modelClientOptions: modelNameOrOptions.modelClientOptions,
33
+ backend: modelNameOrOptions.backend ?? options?.backend,
34
+ };
35
+ }
36
+ return {
37
+ modelName: modelNameOrOptions,
38
+ modelClientOptions,
39
+ backend: options?.backend,
40
+ };
41
+ }
42
+ function resolveEvaluatorBackend(explicitBackend) {
43
+ const configuredBackend = explicitBackend ??
44
+ process.env[EVALUATOR_BACKEND_ENV] ??
45
+ DEFAULT_EVALUATOR_BACKEND;
46
+ const normalizedBackend = configuredBackend.trim().toLowerCase();
47
+ if (normalizedBackend === "legacy" || normalizedBackend === "verifier") {
48
+ return normalizedBackend;
208
49
  }
50
+ throw new StagehandInvalidArgumentError(`Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`);
209
51
  }
210
52
  //# sourceMappingURL=v3Evaluator.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"v3Evaluator.js","sourceRoot":"","sources":["../../../lib/v3Evaluator.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAWxB,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,gCAAgC,CAAC;AAE/E,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACjC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAEH,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAExD,MAAM,OAAO,WAAW;IACd,EAAE,CAAK;IACP,SAAS,CAAiB;IAC1B,kBAAkB,CAAqC;IACvD,YAAY,GAA+B,GAAG,EAAE,GAAE,CAAC,CAAC;IAE5D,YACE,EAAM,EACN,SAA0B,EAC1B,kBAAkC;QAElC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,GAAG,SAAS,IAAK,yBAA4C,CAAC;QAC5E,IAAI,CAAC,kBAAkB,GAAG,kBAAkB,IAAI;YAC9C,MAAM,EACJ,OAAO,CAAC,GAAG,CAAC,cAAc;gBAC1B,OAAO,CAAC,GAAG,CAAC,4BAA4B;gBACxC,EAAE;SACL,CAAC;IACJ,CAAC;IAEO,SAAS;QACf,sEAAsE;QACtE,MAAM,QAAQ,GAAG,IAAI,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,UAAU,GAAG,IAAI,EACjB,YAAY,EACZ,iBAAiB,GAAG,GAAG,EACvB,cAAc,GACf,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,6BAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,MAAM,IAAI,CAAC,UAAU;YACxB,MAAM,IAAI,6BAA6B,CACrC,qDAAqD,CACtD,CAAC;QAEJ,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,gCAAgC,CAAC;gBAC3C,QAAQ;gBACR,WAAW,EAAE,UAAU;gBACvB,YAAY;gBACZ,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,mBAAmB,GAAG,kIAAkI,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,sDAAsD,8HAA8H,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAElZ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,IAAI,mBAAmB,EAAE;oBAChE;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,6CAA6C,cAAc,EAAE;oCACpF,CAAC,CAAC,QAAQ;6BACb;4BACD,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,MAAM,EACJ,SAAS,EACT,UAAU,GAAG,IAAI,EACjB,YAAY,GAAG,8EAA8E,EAC7F,iBAAiB,GAAG,GAAG,GACxB,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,SAAS,EAAE,MAAM;YACpB,MAAM,IAAI,6BAA6B,CACrC,iCAAiC,CAClC,CAAC;QAEJ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,SAAS,GAAG,SAAS;aACxB,GAAG,CACF,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CACV,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAClF;aACA,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,GAAG,YAAY,2CAA2C,UAAU,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,6CAA6C,CAAC,CAAC,CAAC,EAAE,6KAA6K;qBAChX;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE;4BACjC,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE;oBACd,IAAI,EAAE,uBAAuB;oBAC7B,MAAM,EAAE,qBAAqB;iBAC9B;aACF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,QAAQ,CAAC,IAExB,CAAC;YACF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzB,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS;aACvB,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1B,UAAU,EAAE,SAAkB;gBAC9B,SAAS,EAAE,sCAAsC,YAAY,EAAE;aAChE,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,gCAAgC,CAAC,OAK9C;QACC,MAAM,EACJ,QAAQ,EACR,WAAW,EACX,cAAc,EACd,YAAY,GAAG;UACX,cAAc,CAAC,CAAC,CAAC,iGAAiG,CAAC,CAAC,CAAC,EAAE;;;UAGvH,cAAc,CAAC,CAAC,CAAC,kNAAkN,CAAC,CAAC,CAAC,EAAE;0BACxN,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,GACtD,GAAG,OAAO,CAAC;QAEZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,6BAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAC1C,MAAM,IAAI,6BAA6B,CACrC,0CAA0C,CAC3C,CAAC;QAEJ,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5C,IAAI,EAAE,WAAoB;YAC1B,SAAS,EAAE,EAAE,GAAG,EAAE,0BAA0B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,EAAE;SACrE,CAAC,CAAC,CAAC;QAEJ,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;oBACzC;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,2DAA2D,cAAc,qBAAqB,WAAW,CAAC,MAAM,sKAAsK;oCAC7S,CAAC,CAAC,GAAG,QAAQ,qBAAqB,WAAW,CAAC,MAAM,mIAAmI;6BAC1L;4BACD,GAAG,aAAa;yBACjB;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;CACF","sourcesContent":["/**\n * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.\n * It uses the V3 page/screenshot APIs and constructs an LLM client to run\n * structured evaluations (YES/NO with reasoning) on screenshots and/or text.\n */\n\nimport { z } from \"zod\";\nimport type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { LLMParsedResponse } from \"./inference.js\";\nimport { LLMResponse, LLMClient } from \"./v3/llm/LLMClient.js\";\nimport { LogLine } from \"./v3/types/public/logs.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { LLMProvider } from \"./v3/llm/LLMProvider.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\n\nconst EvaluationSchema = z.object({\n evaluation: z.enum([\"YES\", \"NO\"]),\n reasoning: z.string(),\n});\n\nconst BatchEvaluationSchema = z.array(EvaluationSchema);\n\nexport class V3Evaluator {\n private v3: V3;\n private modelName: AvailableModel;\n private modelClientOptions: ClientOptions | { apiKey: string };\n private silentLogger: (message: LogLine) => void = () => {};\n\n constructor(\n v3: V3,\n modelName?: AvailableModel,\n modelClientOptions?: ClientOptions,\n ) {\n this.v3 = v3;\n this.modelName = modelName || (\"google/gemini-2.5-flash\" as AvailableModel);\n this.modelClientOptions = modelClientOptions || {\n apiKey:\n process.env.GEMINI_API_KEY ||\n process.env.GOOGLE_GENERATIVE_AI_API_KEY ||\n \"\",\n };\n }\n\n private getClient(): LLMClient {\n // Prefer a dedicated provider so we can override model per-evaluation\n const provider = new LLMProvider(this.v3.logger);\n return provider.getClient(this.modelName, this.modelClientOptions);\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshot = true,\n systemPrompt,\n screenshotDelayMs = 250,\n agentReasoning,\n } = options;\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!answer && !screenshot)\n throw new StagehandInvalidArgumentError(\n \"Either answer (text) or screenshot must be provided\",\n );\n\n if (Array.isArray(screenshot)) {\n return this._evaluateWithMultipleScreenshots({\n question,\n screenshots: screenshot,\n systemPrompt,\n agentReasoning,\n });\n }\n\n const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? \"a screenshot\" : \"the agents reasoning and actions throughout the task\"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\\n Today's date is ${new Date().toLocaleDateString()}`;\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt || defaultSystemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions taken:\\n${agentReasoning}`\n : question,\n },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n const {\n questions,\n screenshot = true,\n systemPrompt = \"You are an expert evaluator that returns YES or NO with a concise reasoning.\",\n screenshotDelayMs = 250,\n } = options;\n if (!questions?.length)\n throw new StagehandInvalidArgumentError(\n \"Questions array cannot be empty\",\n );\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const formatted = questions\n .map(\n (item, i) =>\n `${i + 1}. ${item.question}${item.answer ? `\\n Answer: ${item.answer}` : \"\"}`,\n )\n .join(\"\\n\\n\");\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n {\n role: \"system\",\n content: `${systemPrompt}\\n\\nYou will be given multiple questions${screenshot ? \" with a screenshot\" : \"\"}. ${questions.some((q) => q.answer) ? \"Some questions include answers to evaluate.\" : \"\"} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,\n },\n {\n role: \"user\",\n content: [\n { type: \"text\", text: formatted },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ],\n },\n ],\n response_model: {\n name: \"BatchEvaluationResult\",\n schema: BatchEvaluationSchema,\n },\n },\n });\n\n try {\n const results = response.data as unknown as z.infer<\n typeof BatchEvaluationSchema\n >;\n return results.map((r) => ({\n evaluation: r.evaluation,\n reasoning: r.reasoning,\n }));\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return questions.map(() => ({\n evaluation: \"INVALID\" as const,\n reasoning: `Failed to get structured response: ${errorMessage}`,\n }));\n }\n }\n\n private async _evaluateWithMultipleScreenshots(options: {\n question: string;\n screenshots: Buffer[];\n systemPrompt?: string;\n agentReasoning?: string;\n }): Promise<EvaluationResult> {\n const {\n question,\n screenshots,\n agentReasoning,\n systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.\n ${agentReasoning ? \"You also have access to the agent's detailed reasoning and thought process throughout the task.\" : \"\"}\n Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.\n Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).\n ${agentReasoning ? \"The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation.\" : \"\"}\n Today's date is ${new Date().toLocaleDateString()}`,\n } = options;\n\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!screenshots || screenshots.length === 0)\n throw new StagehandInvalidArgumentError(\n \"At least one screenshot must be provided\",\n );\n\n const llmClient = this.getClient();\n\n const imageContents = screenshots.map((s) => ({\n type: \"image_url\" as const,\n image_url: { url: `data:image/jpeg;base64,${s.toString(\"base64\")}` },\n }));\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions throughout the task:\\n${agentReasoning}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`\n : `${question}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,\n },\n ...imageContents,\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n}\n"]}
1
+ {"version":3,"file":"v3Evaluator.js","sourceRoot":"","sources":["../../../lib/v3Evaluator.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,6BAA6B,EAAE,MAAM,gCAAgC,CAAC;AAC/E,OAAO,EAAE,iBAAiB,EAAE,MAAM,wBAAwB,CAAC;AAE3D,MAAM,qBAAqB,GAAG,6BAA6B,CAAC;AAC5D,MAAM,yBAAyB,GAAuB,QAAQ,CAAC;AA2B/D,MAAM,OAAO,WAAW;IACL,OAAO,CAAqB;IAC5B,eAAe,CAAoB;IAEpD,YACE,EAAM,EACN,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;QAE5B,MAAM,iBAAiB,GAAG,2BAA2B,CACnD,kBAAkB,EAClB,kBAAkB,EAClB,OAAO,CACR,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG,uBAAuB,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QAClE,IAAI,CAAC,eAAe,GAAG,IAAI,iBAAiB,CAC1C,EAAE,EACF,iBAAiB,CAAC,SAAS,EAC3B,iBAAiB,CAAC,kBAAkB,CACrC,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAC7D,CAAC;IAEO,gBAAgB,CAAC,UAAkB;QACzC,IAAI,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,eAAe,CAAC;QAC9B,CAAC;QAED,MAAM,IAAI,6BAA6B,CACrC,eAAe,UAAU,0BAA0B,qBAAqB,sHAAsH,CAC/L,CAAC;IACJ,CAAC;CACF;AAED,SAAS,2BAA2B,CAClC,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;IAE5B,IACE,kBAAkB;QAClB,OAAO,kBAAkB,KAAK,QAAQ;QACtC,CAAC,KAAK,CAAC,OAAO,CAAC,kBAAkB,CAAC,EAClC,CAAC;QACD,OAAO;YACL,SAAS,EAAE,kBAAkB,CAAC,SAAS;YACvC,kBAAkB,EAAE,kBAAkB,CAAC,kBAAkB;YACzD,OAAO,EAAE,kBAAkB,CAAC,OAAO,IAAI,OAAO,EAAE,OAAO;SACxD,CAAC;IACJ,CAAC;IAED,OAAO;QACL,SAAS,EAAE,kBAAgD;QAC3D,kBAAkB;QAClB,OAAO,EAAE,OAAO,EAAE,OAAO;KAC1B,CAAC;AACJ,CAAC;AAED,SAAS,uBAAuB,CAC9B,eAAoC;IAEpC,MAAM,iBAAiB,GACrB,eAAe;QACf,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC;QAClC,yBAAyB,CAAC;IAC5B,MAAM,iBAAiB,GAAG,iBAAiB,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAEjE,IAAI,iBAAiB,KAAK,QAAQ,IAAI,iBAAiB,KAAK,UAAU,EAAE,CAAC;QACvE,OAAO,iBAAiB,CAAC;IAC3B,CAAC;IAED,MAAM,IAAI,6BAA6B,CACrC,WAAW,qBAAqB,KAAK,iBAAiB,qCAAqC,CAC5F,CAAC;AACJ,CAAC","sourcesContent":["import type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\nimport { LegacyV3Evaluator } from \"./v3LegacyEvaluator.js\";\n\nconst EVALUATOR_BACKEND_ENV = \"STAGEHAND_EVALUATOR_BACKEND\";\nconst DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = \"legacy\";\n\nexport type V3EvaluatorBackend = \"legacy\" | \"verifier\";\n\nexport type V3EvaluatorOptions = {\n /**\n * Selects the evaluator implementation.\n *\n * \"legacy\" preserves the existing screenshot/text YES/NO evaluator.\n * \"verifier\" is reserved for the rubric verifier backend.\n *\n * @default process.env.STAGEHAND_EVALUATOR_BACKEND || \"legacy\"\n */\n backend?: V3EvaluatorBackend;\n};\n\nexport type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n};\n\ntype NormalizedConstructorOptions = {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n backend?: V3EvaluatorBackend;\n};\n\nexport class V3Evaluator {\n private readonly backend: V3EvaluatorBackend;\n private readonly legacyEvaluator: LegacyV3Evaluator;\n\n constructor(\n v3: V3,\n modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n ) {\n const normalizedOptions = normalizeConstructorOptions(\n modelNameOrOptions,\n modelClientOptions,\n options,\n );\n\n this.backend = resolveEvaluatorBackend(normalizedOptions.backend);\n this.legacyEvaluator = new LegacyV3Evaluator(\n v3,\n normalizedOptions.modelName,\n normalizedOptions.modelClientOptions,\n );\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n return this.getLegacyBackend(\"ask\").ask(options);\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n return this.getLegacyBackend(\"batchAsk\").batchAsk(options);\n }\n\n private getLegacyBackend(methodName: string): LegacyV3Evaluator {\n if (this.backend === \"legacy\") {\n return this.legacyEvaluator;\n }\n\n throw new StagehandInvalidArgumentError(\n `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use \"legacy\" or install the verifier backend PR.`,\n );\n }\n}\n\nfunction normalizeConstructorOptions(\n modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n): NormalizedConstructorOptions {\n if (\n modelNameOrOptions &&\n typeof modelNameOrOptions === \"object\" &&\n !Array.isArray(modelNameOrOptions)\n ) {\n return {\n modelName: modelNameOrOptions.modelName,\n modelClientOptions: modelNameOrOptions.modelClientOptions,\n backend: modelNameOrOptions.backend ?? options?.backend,\n };\n }\n\n return {\n modelName: modelNameOrOptions as AvailableModel | undefined,\n modelClientOptions,\n backend: options?.backend,\n };\n}\n\nfunction resolveEvaluatorBackend(\n explicitBackend?: V3EvaluatorBackend,\n): V3EvaluatorBackend {\n const configuredBackend =\n explicitBackend ??\n process.env[EVALUATOR_BACKEND_ENV] ??\n DEFAULT_EVALUATOR_BACKEND;\n const normalizedBackend = configuredBackend.trim().toLowerCase();\n\n if (normalizedBackend === \"legacy\" || normalizedBackend === \"verifier\") {\n return normalizedBackend;\n }\n\n throw new StagehandInvalidArgumentError(\n `Invalid ${EVALUATOR_BACKEND_ENV}=\"${configuredBackend}\". Expected \"legacy\" or \"verifier\".`,\n );\n}\n"]}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Legacy V3 evaluator implementation.
3
+ *
4
+ * This is the behavior-preserving implementation that backs V3Evaluator when
5
+ * STAGEHAND_EVALUATOR_BACKEND=legacy.
6
+ */
7
+ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
8
+ import type { EvaluateOptions, BatchAskOptions, EvaluationResult } from "./v3/types/private/evaluator.js";
9
+ import { V3 } from "./v3/v3.js";
10
+ export declare class LegacyV3Evaluator {
11
+ private v3;
12
+ private modelName;
13
+ private modelClientOptions;
14
+ private silentLogger;
15
+ constructor(v3: V3, modelName?: AvailableModel, modelClientOptions?: ClientOptions);
16
+ private getClient;
17
+ ask(options: EvaluateOptions): Promise<EvaluationResult>;
18
+ batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]>;
19
+ private _evaluateWithMultipleScreenshots;
20
+ }
@@ -0,0 +1,211 @@
1
+ /**
2
+ * Legacy V3 evaluator implementation.
3
+ *
4
+ * This is the behavior-preserving implementation that backs V3Evaluator when
5
+ * STAGEHAND_EVALUATOR_BACKEND=legacy.
6
+ */
7
+ import { z } from "zod";
8
+ import { LLMProvider } from "./v3/llm/LLMProvider.js";
9
+ import { StagehandInvalidArgumentError } from "./v3/types/public/sdkErrors.js";
10
+ const EvaluationSchema = z.object({
11
+ evaluation: z.enum(["YES", "NO"]),
12
+ reasoning: z.string(),
13
+ });
14
+ const BatchEvaluationSchema = z.array(EvaluationSchema);
15
+ export class LegacyV3Evaluator {
16
+ v3;
17
+ modelName;
18
+ modelClientOptions;
19
+ silentLogger = () => { };
20
+ constructor(v3, modelName, modelClientOptions) {
21
+ this.v3 = v3;
22
+ this.modelName = modelName || "google/gemini-2.5-flash";
23
+ this.modelClientOptions = modelClientOptions || {
24
+ apiKey: process.env.GEMINI_API_KEY ||
25
+ process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
26
+ "",
27
+ };
28
+ }
29
+ getClient() {
30
+ // Prefer a dedicated provider so we can override model per-evaluation
31
+ const provider = new LLMProvider(this.v3.logger);
32
+ return provider.getClient(this.modelName, this.modelClientOptions);
33
+ }
34
+ async ask(options) {
35
+ const { question, answer, screenshot = true, systemPrompt, screenshotDelayMs = 250, agentReasoning, } = options;
36
+ if (!question)
37
+ throw new StagehandInvalidArgumentError("Question cannot be an empty string");
38
+ if (!answer && !screenshot)
39
+ throw new StagehandInvalidArgumentError("Either answer (text) or screenshot must be provided");
40
+ if (Array.isArray(screenshot)) {
41
+ return this._evaluateWithMultipleScreenshots({
42
+ question,
43
+ screenshots: screenshot,
44
+ systemPrompt,
45
+ agentReasoning,
46
+ });
47
+ }
48
+ const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n Today's date is ${new Date().toLocaleDateString()}`;
49
+ await new Promise((r) => setTimeout(r, screenshotDelayMs));
50
+ let imageBuffer;
51
+ if (screenshot) {
52
+ const page = await this.v3.context.awaitActivePage();
53
+ imageBuffer = await page.screenshot({ fullPage: false });
54
+ }
55
+ const llmClient = this.getClient();
56
+ const response = await llmClient.createChatCompletion({
57
+ logger: this.silentLogger,
58
+ options: {
59
+ messages: [
60
+ { role: "system", content: systemPrompt || defaultSystemPrompt },
61
+ {
62
+ role: "user",
63
+ content: [
64
+ {
65
+ type: "text",
66
+ text: agentReasoning
67
+ ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
68
+ : question,
69
+ },
70
+ ...(screenshot && imageBuffer
71
+ ? [
72
+ {
73
+ type: "image_url",
74
+ image_url: {
75
+ url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
76
+ },
77
+ },
78
+ ]
79
+ : []),
80
+ ...(answer
81
+ ? [{ type: "text", text: `the answer is ${answer}` }]
82
+ : []),
83
+ ],
84
+ },
85
+ ],
86
+ response_model: { name: "EvaluationResult", schema: EvaluationSchema },
87
+ },
88
+ });
89
+ try {
90
+ const result = response.data;
91
+ return { evaluation: result.evaluation, reasoning: result.reasoning };
92
+ }
93
+ catch (error) {
94
+ const errorMessage = error instanceof Error ? error.message : String(error);
95
+ return {
96
+ evaluation: "INVALID",
97
+ reasoning: `Failed to get structured response: ${errorMessage}`,
98
+ };
99
+ }
100
+ }
101
+ async batchAsk(options) {
102
+ const { questions, screenshot = true, systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.", screenshotDelayMs = 250, } = options;
103
+ if (!questions?.length)
104
+ throw new StagehandInvalidArgumentError("Questions array cannot be empty");
105
+ await new Promise((r) => setTimeout(r, screenshotDelayMs));
106
+ let imageBuffer;
107
+ if (screenshot) {
108
+ const page = await this.v3.context.awaitActivePage();
109
+ imageBuffer = await page.screenshot({ fullPage: false });
110
+ }
111
+ const llmClient = this.getClient();
112
+ const formatted = questions
113
+ .map((item, i) => `${i + 1}. ${item.question}${item.answer ? `\n Answer: ${item.answer}` : ""}`)
114
+ .join("\n\n");
115
+ const response = await llmClient.createChatCompletion({
116
+ logger: this.silentLogger,
117
+ options: {
118
+ messages: [
119
+ {
120
+ role: "system",
121
+ content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
122
+ },
123
+ {
124
+ role: "user",
125
+ content: [
126
+ { type: "text", text: formatted },
127
+ ...(screenshot && imageBuffer
128
+ ? [
129
+ {
130
+ type: "image_url",
131
+ image_url: {
132
+ url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
133
+ },
134
+ },
135
+ ]
136
+ : []),
137
+ ],
138
+ },
139
+ ],
140
+ response_model: {
141
+ name: "BatchEvaluationResult",
142
+ schema: BatchEvaluationSchema,
143
+ },
144
+ },
145
+ });
146
+ try {
147
+ const results = response.data;
148
+ return results.map((r) => ({
149
+ evaluation: r.evaluation,
150
+ reasoning: r.reasoning,
151
+ }));
152
+ }
153
+ catch (error) {
154
+ const errorMessage = error instanceof Error ? error.message : String(error);
155
+ return questions.map(() => ({
156
+ evaluation: "INVALID",
157
+ reasoning: `Failed to get structured response: ${errorMessage}`,
158
+ }));
159
+ }
160
+ }
161
+ async _evaluateWithMultipleScreenshots(options) {
162
+ const { question, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
163
+ ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
164
+ Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
165
+ Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
166
+ ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
167
+ Today's date is ${new Date().toLocaleDateString()}`, } = options;
168
+ if (!question)
169
+ throw new StagehandInvalidArgumentError("Question cannot be an empty string");
170
+ if (!screenshots || screenshots.length === 0)
171
+ throw new StagehandInvalidArgumentError("At least one screenshot must be provided");
172
+ const llmClient = this.getClient();
173
+ const imageContents = screenshots.map((s) => ({
174
+ type: "image_url",
175
+ image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` },
176
+ }));
177
+ const response = await llmClient.createChatCompletion({
178
+ logger: this.silentLogger,
179
+ options: {
180
+ messages: [
181
+ { role: "system", content: systemPrompt },
182
+ {
183
+ role: "user",
184
+ content: [
185
+ {
186
+ type: "text",
187
+ text: agentReasoning
188
+ ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
189
+ : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
190
+ },
191
+ ...imageContents,
192
+ ],
193
+ },
194
+ ],
195
+ response_model: { name: "EvaluationResult", schema: EvaluationSchema },
196
+ },
197
+ });
198
+ try {
199
+ const result = response.data;
200
+ return { evaluation: result.evaluation, reasoning: result.reasoning };
201
+ }
202
+ catch (error) {
203
+ const errorMessage = error instanceof Error ? error.message : String(error);
204
+ return {
205
+ evaluation: "INVALID",
206
+ reasoning: `Failed to get structured response: ${errorMessage}`,
207
+ };
208
+ }
209
+ }
210
+ }
211
+ //# sourceMappingURL=v3LegacyEvaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"v3LegacyEvaluator.js","sourceRoot":"","sources":["../../../lib/v3LegacyEvaluator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAWxB,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,EAAE,6BAA6B,EAAE,MAAM,gCAAgC,CAAC;AAE/E,MAAM,gBAAgB,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACjC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAEH,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAExD,MAAM,OAAO,iBAAiB;IACpB,EAAE,CAAK;IACP,SAAS,CAAiB;IAC1B,kBAAkB,CAAqC;IACvD,YAAY,GAA+B,GAAG,EAAE,GAAE,CAAC,CAAC;IAE5D,YACE,EAAM,EACN,SAA0B,EAC1B,kBAAkC;QAElC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,GAAG,SAAS,IAAK,yBAA4C,CAAC;QAC5E,IAAI,CAAC,kBAAkB,GAAG,kBAAkB,IAAI;YAC9C,MAAM,EACJ,OAAO,CAAC,GAAG,CAAC,cAAc;gBAC1B,OAAO,CAAC,GAAG,CAAC,4BAA4B;gBACxC,EAAE;SACL,CAAC;IACJ,CAAC;IAEO,SAAS;QACf,sEAAsE;QACtE,MAAM,QAAQ,GAAG,IAAI,WAAW,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,UAAU,GAAG,IAAI,EACjB,YAAY,EACZ,iBAAiB,GAAG,GAAG,EACvB,cAAc,GACf,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,6BAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,MAAM,IAAI,CAAC,UAAU;YACxB,MAAM,IAAI,6BAA6B,CACrC,qDAAqD,CACtD,CAAC;QAEJ,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,gCAAgC,CAAC;gBAC3C,QAAQ;gBACR,WAAW,EAAE,UAAU;gBACvB,YAAY;gBACZ,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,mBAAmB,GAAG,kIAAkI,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,sDAAsD,8HAA8H,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAElZ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,IAAI,mBAAmB,EAAE;oBAChE;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,6CAA6C,cAAc,EAAE;oCACpF,CAAC,CAAC,QAAQ;6BACb;4BACD,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,MAAM,EACJ,SAAS,EACT,UAAU,GAAG,IAAI,EACjB,YAAY,GAAG,8EAA8E,EAC7F,iBAAiB,GAAG,GAAG,GACxB,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,SAAS,EAAE,MAAM;YACpB,MAAM,IAAI,6BAA6B,CACrC,iCAAiC,CAClC,CAAC;QAEJ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,SAAS,GAAG,SAAS;aACxB,GAAG,CACF,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CACV,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAClF;aACA,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,GAAG,YAAY,2CAA2C,UAAU,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,6CAA6C,CAAC,CAAC,CAAC,EAAE,6KAA6K;qBAChX;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE;4BACjC,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE;oBACd,IAAI,EAAE,uBAAuB;oBAC7B,MAAM,EAAE,qBAAqB;iBAC9B;aACF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,QAAQ,CAAC,IAExB,CAAC;YACF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzB,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS;aACvB,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1B,UAAU,EAAE,SAAkB;gBAC9B,SAAS,EAAE,sCAAsC,YAAY,EAAE;aAChE,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,gCAAgC,CAAC,OAK9C;QACC,MAAM,EACJ,QAAQ,EACR,WAAW,EACX,cAAc,EACd,YAAY,GAAG;UACX,cAAc,CAAC,CAAC,CAAC,iGAAiG,CAAC,CAAC,CAAC,EAAE;;;UAGvH,cAAc,CAAC,CAAC,CAAC,kNAAkN,CAAC,CAAC,CAAC,EAAE;0BACxN,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,GACtD,GAAG,OAAO,CAAC;QAEZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,6BAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAC1C,MAAM,IAAI,6BAA6B,CACrC,0CAA0C,CAC3C,CAAC;QAEJ,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5C,IAAI,EAAE,WAAoB;YAC1B,SAAS,EAAE,EAAE,GAAG,EAAE,0BAA0B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,EAAE;SACrE,CAAC,CAAC,CAAC;QAEJ,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;oBACzC;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,2DAA2D,cAAc,qBAAqB,WAAW,CAAC,MAAM,sKAAsK;oCAC7S,CAAC,CAAC,GAAG,QAAQ,qBAAqB,WAAW,CAAC,MAAM,mIAAmI;6BAC1L;4BACD,GAAG,aAAa;yBACjB;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;CACF","sourcesContent":["/**\n * Legacy V3 evaluator implementation.\n *\n * This is the behavior-preserving implementation that backs V3Evaluator when\n * STAGEHAND_EVALUATOR_BACKEND=legacy.\n */\n\nimport { z } from \"zod\";\nimport type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { LLMParsedResponse } from \"./inference.js\";\nimport { LLMResponse, LLMClient } from \"./v3/llm/LLMClient.js\";\nimport { LogLine } from \"./v3/types/public/logs.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { LLMProvider } from \"./v3/llm/LLMProvider.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\n\nconst EvaluationSchema = z.object({\n evaluation: z.enum([\"YES\", \"NO\"]),\n reasoning: z.string(),\n});\n\nconst BatchEvaluationSchema = z.array(EvaluationSchema);\n\nexport class LegacyV3Evaluator {\n private v3: V3;\n private modelName: AvailableModel;\n private modelClientOptions: ClientOptions | { apiKey: string };\n private silentLogger: (message: LogLine) => void = () => {};\n\n constructor(\n v3: V3,\n modelName?: AvailableModel,\n modelClientOptions?: ClientOptions,\n ) {\n this.v3 = v3;\n this.modelName = modelName || (\"google/gemini-2.5-flash\" as AvailableModel);\n this.modelClientOptions = modelClientOptions || {\n apiKey:\n process.env.GEMINI_API_KEY ||\n process.env.GOOGLE_GENERATIVE_AI_API_KEY ||\n \"\",\n };\n }\n\n private getClient(): LLMClient {\n // Prefer a dedicated provider so we can override model per-evaluation\n const provider = new LLMProvider(this.v3.logger);\n return provider.getClient(this.modelName, this.modelClientOptions);\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshot = true,\n systemPrompt,\n screenshotDelayMs = 250,\n agentReasoning,\n } = options;\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!answer && !screenshot)\n throw new StagehandInvalidArgumentError(\n \"Either answer (text) or screenshot must be provided\",\n );\n\n if (Array.isArray(screenshot)) {\n return this._evaluateWithMultipleScreenshots({\n question,\n screenshots: screenshot,\n systemPrompt,\n agentReasoning,\n });\n }\n\n const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? \"a screenshot\" : \"the agents reasoning and actions throughout the task\"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\\n Today's date is ${new Date().toLocaleDateString()}`;\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt || defaultSystemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions taken:\\n${agentReasoning}`\n : question,\n },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n const {\n questions,\n screenshot = true,\n systemPrompt = \"You are an expert evaluator that returns YES or NO with a concise reasoning.\",\n screenshotDelayMs = 250,\n } = options;\n if (!questions?.length)\n throw new StagehandInvalidArgumentError(\n \"Questions array cannot be empty\",\n );\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const formatted = questions\n .map(\n (item, i) =>\n `${i + 1}. ${item.question}${item.answer ? `\\n Answer: ${item.answer}` : \"\"}`,\n )\n .join(\"\\n\\n\");\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n {\n role: \"system\",\n content: `${systemPrompt}\\n\\nYou will be given multiple questions${screenshot ? \" with a screenshot\" : \"\"}. ${questions.some((q) => q.answer) ? \"Some questions include answers to evaluate.\" : \"\"} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,\n },\n {\n role: \"user\",\n content: [\n { type: \"text\", text: formatted },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ],\n },\n ],\n response_model: {\n name: \"BatchEvaluationResult\",\n schema: BatchEvaluationSchema,\n },\n },\n });\n\n try {\n const results = response.data as unknown as z.infer<\n typeof BatchEvaluationSchema\n >;\n return results.map((r) => ({\n evaluation: r.evaluation,\n reasoning: r.reasoning,\n }));\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return questions.map(() => ({\n evaluation: \"INVALID\" as const,\n reasoning: `Failed to get structured response: ${errorMessage}`,\n }));\n }\n }\n\n private async _evaluateWithMultipleScreenshots(options: {\n question: string;\n screenshots: Buffer[];\n systemPrompt?: string;\n agentReasoning?: string;\n }): Promise<EvaluationResult> {\n const {\n question,\n screenshots,\n agentReasoning,\n systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.\n ${agentReasoning ? \"You also have access to the agent's detailed reasoning and thought process throughout the task.\" : \"\"}\n Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.\n Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).\n ${agentReasoning ? \"The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation.\" : \"\"}\n Today's date is ${new Date().toLocaleDateString()}`,\n } = options;\n\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!screenshots || screenshots.length === 0)\n throw new StagehandInvalidArgumentError(\n \"At least one screenshot must be provided\",\n );\n\n const llmClient = this.getClient();\n\n const imageContents = screenshots.map((s) => ({\n type: \"image_url\" as const,\n image_url: { url: `data:image/jpeg;base64,${s.toString(\"base64\")}` },\n }));\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions throughout the task:\\n${agentReasoning}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`\n : `${question}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,\n },\n ...imageContents,\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n}\n"]}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@browserbasehq/orca",
3
- "version": "3.4.0-preview-5",
3
+ "version": "3.5.0-preview.0",
4
4
  "description": "An AI web browsing framework focused on simplicity and extensibility.",
5
5
  "type": "module",
6
6
  "main": "./dist/cjs/index.js",
@@ -35,9 +35,9 @@
35
35
  "build-dom-scripts": "tsx scripts/build-dom-scripts.ts",
36
36
  "build:cjs": "tsx scripts/build-cjs.ts",
37
37
  "build:esm": "tsx scripts/build-esm.ts",
38
- "build": "pnpm --filter @browserbasehq/stagehand run --parallel \"/^build:(esm|cjs)$/\"",
38
+ "build": "pnpm --filter @browserbasehq/orca run --parallel \"/^build:(esm|cjs)$/\"",
39
39
  "example": "node --import tsx -e \"const args=process.argv.slice(1).filter(a=>a!=='--'); const [p]=args; const n=(p||'example').replace(/^\\.\\//,'').replace(/\\.ts$/i,''); import('node:path').then(path=>import(new URL(path.resolve('examples', n + '.ts'), 'file:')));\" --",
40
- "test": "pnpm -w --dir ../.. exec turbo run test:core test:e2e --filter=@browserbasehq/stagehand --",
40
+ "test": "pnpm -w --dir ../.. exec turbo run test:core test:e2e --filter=@browserbasehq/orca --",
41
41
  "test:core": "tsx scripts/test-core.ts",
42
42
  "test:e2e": "tsx scripts/test-e2e.ts",
43
43
  "format": "prettier --write .",