@browserbasehq/orca 3.4.0-preview-5 → 3.5.0-preview.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/dist/cjs/lib/inference.d.ts +2 -1
  2. package/dist/cjs/lib/inference.js +10 -3
  3. package/dist/cjs/lib/inference.js.map +1 -1
  4. package/dist/cjs/lib/prompt.d.ts +2 -2
  5. package/dist/cjs/lib/prompt.js +19 -4
  6. package/dist/cjs/lib/prompt.js.map +1 -1
  7. package/dist/cjs/lib/v3/handlers/extractHandler.js +19 -2
  8. package/dist/cjs/lib/v3/handlers/extractHandler.js.map +1 -1
  9. package/dist/cjs/lib/v3/index.d.ts +1 -0
  10. package/dist/cjs/lib/v3/index.js.map +1 -1
  11. package/dist/cjs/lib/v3/types/private/handlers.d.ts +1 -0
  12. package/dist/cjs/lib/v3/types/private/handlers.js.map +1 -1
  13. package/dist/cjs/lib/v3/types/public/api.d.ts +2 -0
  14. package/dist/cjs/lib/v3/types/public/api.js +4 -0
  15. package/dist/cjs/lib/v3/types/public/api.js.map +1 -1
  16. package/dist/cjs/lib/v3/types/public/methods.d.ts +1 -0
  17. package/dist/cjs/lib/v3/types/public/methods.js.map +1 -1
  18. package/dist/cjs/lib/v3/v3.js +2 -0
  19. package/dist/cjs/lib/v3/v3.js.map +1 -1
  20. package/dist/cjs/lib/v3Evaluator.d.ts +20 -12
  21. package/dist/cjs/lib/v3Evaluator.js +41 -199
  22. package/dist/cjs/lib/v3Evaluator.js.map +1 -1
  23. package/dist/cjs/lib/v3LegacyEvaluator.d.ts +20 -0
  24. package/dist/cjs/lib/v3LegacyEvaluator.js +215 -0
  25. package/dist/cjs/lib/v3LegacyEvaluator.js.map +1 -0
  26. package/dist/esm/lib/inference.d.ts +2 -1
  27. package/dist/esm/lib/inference.js +10 -3
  28. package/dist/esm/lib/inference.js.map +1 -1
  29. package/dist/esm/lib/prompt.d.ts +2 -2
  30. package/dist/esm/lib/prompt.js +19 -4
  31. package/dist/esm/lib/prompt.js.map +1 -1
  32. package/dist/esm/lib/v3/handlers/extractHandler.js +19 -2
  33. package/dist/esm/lib/v3/handlers/extractHandler.js.map +1 -1
  34. package/dist/esm/lib/v3/index.d.ts +1 -0
  35. package/dist/esm/lib/v3/index.js.map +1 -1
  36. package/dist/esm/lib/v3/types/private/handlers.d.ts +1 -0
  37. package/dist/esm/lib/v3/types/private/handlers.js.map +1 -1
  38. package/dist/esm/lib/v3/types/public/api.d.ts +2 -0
  39. package/dist/esm/lib/v3/types/public/api.js +4 -0
  40. package/dist/esm/lib/v3/types/public/api.js.map +1 -1
  41. package/dist/esm/lib/v3/types/public/methods.d.ts +1 -0
  42. package/dist/esm/lib/v3/types/public/methods.js.map +1 -1
  43. package/dist/esm/lib/v3/v3.js +2 -0
  44. package/dist/esm/lib/v3/v3.js.map +1 -1
  45. package/dist/esm/lib/v3Evaluator.d.ts +20 -12
  46. package/dist/esm/lib/v3Evaluator.js +41 -199
  47. package/dist/esm/lib/v3Evaluator.js.map +1 -1
  48. package/dist/esm/lib/v3LegacyEvaluator.d.ts +20 -0
  49. package/dist/esm/lib/v3LegacyEvaluator.js +211 -0
  50. package/dist/esm/lib/v3LegacyEvaluator.js.map +1 -0
  51. package/package.json +3 -3
  52. package/dist/cjs/lib/v3/dom/build/selectorRuntime.generated.d.ts +0 -24
  53. package/dist/cjs/lib/v3/dom/build/selectorRuntime.generated.js +0 -31
  54. package/dist/cjs/lib/v3/dom/build/selectorRuntime.generated.js.map +0 -1
  55. package/dist/esm/lib/v3/dom/build/selectorRuntime.generated.d.ts +0 -24
  56. package/dist/esm/lib/v3/dom/build/selectorRuntime.generated.js +0 -28
  57. package/dist/esm/lib/v3/dom/build/selectorRuntime.generated.js.map +0 -1
@@ -1,19 +1,27 @@
1
- /**
2
- * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.
3
- * It uses the V3 page/screenshot APIs and constructs an LLM client to run
4
- * structured evaluations (YES/NO with reasoning) on screenshots and/or text.
5
- */
6
1
  import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
7
2
  import type { EvaluateOptions, BatchAskOptions, EvaluationResult } from "./v3/types/private/evaluator.js";
8
3
  import { V3 } from "./v3/v3.js";
4
+ export type V3EvaluatorBackend = "legacy" | "verifier";
5
+ export type V3EvaluatorOptions = {
6
+ /**
7
+ * Selects the evaluator implementation.
8
+ *
9
+ * "legacy" preserves the existing screenshot/text YES/NO evaluator.
10
+ * "verifier" is reserved for the rubric verifier backend.
11
+ *
12
+ * @default process.env.STAGEHAND_EVALUATOR_BACKEND || "legacy"
13
+ */
14
+ backend?: V3EvaluatorBackend;
15
+ };
16
+ export type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {
17
+ modelName?: AvailableModel;
18
+ modelClientOptions?: ClientOptions;
19
+ };
9
20
  export declare class V3Evaluator {
10
- private v3;
11
- private modelName;
12
- private modelClientOptions;
13
- private silentLogger;
14
- constructor(v3: V3, modelName?: AvailableModel, modelClientOptions?: ClientOptions);
15
- private getClient;
21
+ private readonly backend;
22
+ private readonly legacyEvaluator;
23
+ constructor(v3: V3, modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions, modelClientOptions?: ClientOptions, options?: V3EvaluatorOptions);
16
24
  ask(options: EvaluateOptions): Promise<EvaluationResult>;
17
25
  batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]>;
18
- private _evaluateWithMultipleScreenshots;
26
+ private getLegacyBackend;
19
27
  }
@@ -1,214 +1,56 @@
1
1
  "use strict";
2
- /**
3
- * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.
4
- * It uses the V3 page/screenshot APIs and constructs an LLM client to run
5
- * structured evaluations (YES/NO with reasoning) on screenshots and/or text.
6
- */
7
2
  Object.defineProperty(exports, "__esModule", { value: true });
8
3
  exports.V3Evaluator = void 0;
9
- const zod_1 = require("zod");
10
- const LLMProvider_js_1 = require("./v3/llm/LLMProvider.js");
11
4
  const sdkErrors_js_1 = require("./v3/types/public/sdkErrors.js");
12
- const EvaluationSchema = zod_1.z.object({
13
- evaluation: zod_1.z.enum(["YES", "NO"]),
14
- reasoning: zod_1.z.string(),
15
- });
16
- const BatchEvaluationSchema = zod_1.z.array(EvaluationSchema);
5
+ const v3LegacyEvaluator_js_1 = require("./v3LegacyEvaluator.js");
6
+ const EVALUATOR_BACKEND_ENV = "STAGEHAND_EVALUATOR_BACKEND";
7
+ const DEFAULT_EVALUATOR_BACKEND = "legacy";
17
8
  class V3Evaluator {
18
- v3;
19
- modelName;
20
- modelClientOptions;
21
- silentLogger = () => { };
22
- constructor(v3, modelName, modelClientOptions) {
23
- this.v3 = v3;
24
- this.modelName = modelName || "google/gemini-2.5-flash";
25
- this.modelClientOptions = modelClientOptions || {
26
- apiKey: process.env.GEMINI_API_KEY ||
27
- process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
28
- "",
29
- };
30
- }
31
- getClient() {
32
- // Prefer a dedicated provider so we can override model per-evaluation
33
- const provider = new LLMProvider_js_1.LLMProvider(this.v3.logger);
34
- return provider.getClient(this.modelName, this.modelClientOptions);
9
+ backend;
10
+ legacyEvaluator;
11
+ constructor(v3, modelNameOrOptions, modelClientOptions, options) {
12
+ const normalizedOptions = normalizeConstructorOptions(modelNameOrOptions, modelClientOptions, options);
13
+ this.backend = resolveEvaluatorBackend(normalizedOptions.backend);
14
+ this.legacyEvaluator = new v3LegacyEvaluator_js_1.LegacyV3Evaluator(v3, normalizedOptions.modelName, normalizedOptions.modelClientOptions);
35
15
  }
36
16
  async ask(options) {
37
- const { question, answer, screenshot = true, systemPrompt, screenshotDelayMs = 250, agentReasoning, } = options;
38
- if (!question)
39
- throw new sdkErrors_js_1.StagehandInvalidArgumentError("Question cannot be an empty string");
40
- if (!answer && !screenshot)
41
- throw new sdkErrors_js_1.StagehandInvalidArgumentError("Either answer (text) or screenshot must be provided");
42
- if (Array.isArray(screenshot)) {
43
- return this._evaluateWithMultipleScreenshots({
44
- question,
45
- screenshots: screenshot,
46
- systemPrompt,
47
- agentReasoning,
48
- });
49
- }
50
- const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n Today's date is ${new Date().toLocaleDateString()}`;
51
- await new Promise((r) => setTimeout(r, screenshotDelayMs));
52
- let imageBuffer;
53
- if (screenshot) {
54
- const page = await this.v3.context.awaitActivePage();
55
- imageBuffer = await page.screenshot({ fullPage: false });
56
- }
57
- const llmClient = this.getClient();
58
- const response = await llmClient.createChatCompletion({
59
- logger: this.silentLogger,
60
- options: {
61
- messages: [
62
- { role: "system", content: systemPrompt || defaultSystemPrompt },
63
- {
64
- role: "user",
65
- content: [
66
- {
67
- type: "text",
68
- text: agentReasoning
69
- ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
70
- : question,
71
- },
72
- ...(screenshot && imageBuffer
73
- ? [
74
- {
75
- type: "image_url",
76
- image_url: {
77
- url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
78
- },
79
- },
80
- ]
81
- : []),
82
- ...(answer
83
- ? [{ type: "text", text: `the answer is ${answer}` }]
84
- : []),
85
- ],
86
- },
87
- ],
88
- response_model: { name: "EvaluationResult", schema: EvaluationSchema },
89
- },
90
- });
91
- try {
92
- const result = response.data;
93
- return { evaluation: result.evaluation, reasoning: result.reasoning };
94
- }
95
- catch (error) {
96
- const errorMessage = error instanceof Error ? error.message : String(error);
97
- return {
98
- evaluation: "INVALID",
99
- reasoning: `Failed to get structured response: ${errorMessage}`,
100
- };
101
- }
17
+ return this.getLegacyBackend("ask").ask(options);
102
18
  }
103
19
  async batchAsk(options) {
104
- const { questions, screenshot = true, systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.", screenshotDelayMs = 250, } = options;
105
- if (!questions?.length)
106
- throw new sdkErrors_js_1.StagehandInvalidArgumentError("Questions array cannot be empty");
107
- await new Promise((r) => setTimeout(r, screenshotDelayMs));
108
- let imageBuffer;
109
- if (screenshot) {
110
- const page = await this.v3.context.awaitActivePage();
111
- imageBuffer = await page.screenshot({ fullPage: false });
112
- }
113
- const llmClient = this.getClient();
114
- const formatted = questions
115
- .map((item, i) => `${i + 1}. ${item.question}${item.answer ? `\n Answer: ${item.answer}` : ""}`)
116
- .join("\n\n");
117
- const response = await llmClient.createChatCompletion({
118
- logger: this.silentLogger,
119
- options: {
120
- messages: [
121
- {
122
- role: "system",
123
- content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
124
- },
125
- {
126
- role: "user",
127
- content: [
128
- { type: "text", text: formatted },
129
- ...(screenshot && imageBuffer
130
- ? [
131
- {
132
- type: "image_url",
133
- image_url: {
134
- url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
135
- },
136
- },
137
- ]
138
- : []),
139
- ],
140
- },
141
- ],
142
- response_model: {
143
- name: "BatchEvaluationResult",
144
- schema: BatchEvaluationSchema,
145
- },
146
- },
147
- });
148
- try {
149
- const results = response.data;
150
- return results.map((r) => ({
151
- evaluation: r.evaluation,
152
- reasoning: r.reasoning,
153
- }));
154
- }
155
- catch (error) {
156
- const errorMessage = error instanceof Error ? error.message : String(error);
157
- return questions.map(() => ({
158
- evaluation: "INVALID",
159
- reasoning: `Failed to get structured response: ${errorMessage}`,
160
- }));
161
- }
20
+ return this.getLegacyBackend("batchAsk").batchAsk(options);
162
21
  }
163
- async _evaluateWithMultipleScreenshots(options) {
164
- const { question, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
165
- ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
166
- Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
167
- Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
168
- ${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
169
- Today's date is ${new Date().toLocaleDateString()}`, } = options;
170
- if (!question)
171
- throw new sdkErrors_js_1.StagehandInvalidArgumentError("Question cannot be an empty string");
172
- if (!screenshots || screenshots.length === 0)
173
- throw new sdkErrors_js_1.StagehandInvalidArgumentError("At least one screenshot must be provided");
174
- const llmClient = this.getClient();
175
- const imageContents = screenshots.map((s) => ({
176
- type: "image_url",
177
- image_url: { url: `data:image/jpeg;base64,${s.toString("base64")}` },
178
- }));
179
- const response = await llmClient.createChatCompletion({
180
- logger: this.silentLogger,
181
- options: {
182
- messages: [
183
- { role: "system", content: systemPrompt },
184
- {
185
- role: "user",
186
- content: [
187
- {
188
- type: "text",
189
- text: agentReasoning
190
- ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
191
- : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
192
- },
193
- ...imageContents,
194
- ],
195
- },
196
- ],
197
- response_model: { name: "EvaluationResult", schema: EvaluationSchema },
198
- },
199
- });
200
- try {
201
- const result = response.data;
202
- return { evaluation: result.evaluation, reasoning: result.reasoning };
203
- }
204
- catch (error) {
205
- const errorMessage = error instanceof Error ? error.message : String(error);
206
- return {
207
- evaluation: "INVALID",
208
- reasoning: `Failed to get structured response: ${errorMessage}`,
209
- };
22
+ getLegacyBackend(methodName) {
23
+ if (this.backend === "legacy") {
24
+ return this.legacyEvaluator;
210
25
  }
26
+ throw new sdkErrors_js_1.StagehandInvalidArgumentError(`V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`);
211
27
  }
212
28
  }
213
29
  exports.V3Evaluator = V3Evaluator;
30
+ function normalizeConstructorOptions(modelNameOrOptions, modelClientOptions, options) {
31
+ if (modelNameOrOptions &&
32
+ typeof modelNameOrOptions === "object" &&
33
+ !Array.isArray(modelNameOrOptions)) {
34
+ return {
35
+ modelName: modelNameOrOptions.modelName,
36
+ modelClientOptions: modelNameOrOptions.modelClientOptions,
37
+ backend: modelNameOrOptions.backend ?? options?.backend,
38
+ };
39
+ }
40
+ return {
41
+ modelName: modelNameOrOptions,
42
+ modelClientOptions,
43
+ backend: options?.backend,
44
+ };
45
+ }
46
+ function resolveEvaluatorBackend(explicitBackend) {
47
+ const configuredBackend = explicitBackend ??
48
+ process.env[EVALUATOR_BACKEND_ENV] ??
49
+ DEFAULT_EVALUATOR_BACKEND;
50
+ const normalizedBackend = configuredBackend.trim().toLowerCase();
51
+ if (normalizedBackend === "legacy" || normalizedBackend === "verifier") {
52
+ return normalizedBackend;
53
+ }
54
+ throw new sdkErrors_js_1.StagehandInvalidArgumentError(`Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`);
55
+ }
214
56
  //# sourceMappingURL=v3Evaluator.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"v3Evaluator.js","sourceRoot":"","sources":["../../../lib/v3Evaluator.ts"],"names":[],"mappings":";AAAA;;;;GAIG;;;AAEH,6BAAwB;AAWxB,4DAAsD;AACtD,iEAA+E;AAE/E,MAAM,gBAAgB,GAAG,OAAC,CAAC,MAAM,CAAC;IAChC,UAAU,EAAE,OAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACjC,SAAS,EAAE,OAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAEH,MAAM,qBAAqB,GAAG,OAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAExD,MAAa,WAAW;IACd,EAAE,CAAK;IACP,SAAS,CAAiB;IAC1B,kBAAkB,CAAqC;IACvD,YAAY,GAA+B,GAAG,EAAE,GAAE,CAAC,CAAC;IAE5D,YACE,EAAM,EACN,SAA0B,EAC1B,kBAAkC;QAElC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,GAAG,SAAS,IAAK,yBAA4C,CAAC;QAC5E,IAAI,CAAC,kBAAkB,GAAG,kBAAkB,IAAI;YAC9C,MAAM,EACJ,OAAO,CAAC,GAAG,CAAC,cAAc;gBAC1B,OAAO,CAAC,GAAG,CAAC,4BAA4B;gBACxC,EAAE;SACL,CAAC;IACJ,CAAC;IAEO,SAAS;QACf,sEAAsE;QACtE,MAAM,QAAQ,GAAG,IAAI,4BAAW,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,UAAU,GAAG,IAAI,EACjB,YAAY,EACZ,iBAAiB,GAAG,GAAG,EACvB,cAAc,GACf,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,MAAM,IAAI,CAAC,UAAU;YACxB,MAAM,IAAI,4CAA6B,CACrC,qDAAqD,CACtD,CAAC;QAEJ,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,gCAAgC,CAAC;gBAC3C,QAAQ;gBACR,WAAW,EAAE,UAAU;gBACvB,YAAY;gBACZ,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,mBAAmB,GAAG,kIAAkI,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,sDAAsD,8HAA8H,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAElZ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EA
AE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,IAAI,mBAAmB,EAAE;oBAChE;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,6CAA6C,cAAc,EAAE;oCACpF,CAAC,CAAC,QAAQ;6BACb;4BACD,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,MAAM,EACJ,SAAS,EACT,UAAU,GAAG,IAAI,EACjB,YAAY,GAAG,8EAA8E,EAC7F,iBAAiB,GAAG,GAAG,GACxB,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,SAAS,EAAE,MAAM;YACpB,MAAM,IAAI,4CAA6B,CACrC,iCAAiC,CAClC,CAAC;QAEJ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,SAAS,GAAG,SAAS;aACxB,GAAG,CACF,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CACV,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAClF;aACA,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,GAAG,YAAY,2CAA2C,UAAU,C
AAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,6CAA6C,CAAC,CAAC,CAAC,EAAE,6KAA6K;qBAChX;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE;4BACjC,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE;oBACd,IAAI,EAAE,uBAAuB;oBAC7B,MAAM,EAAE,qBAAqB;iBAC9B;aACF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,QAAQ,CAAC,IAExB,CAAC;YACF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzB,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS;aACvB,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1B,UAAU,EAAE,SAAkB;gBAC9B,SAAS,EAAE,sCAAsC,YAAY,EAAE;aAChE,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,gCAAgC,CAAC,OAK9C;QACC,MAAM,EACJ,QAAQ,EACR,WAAW,EACX,cAAc,EACd,YAAY,GAAG;UACX,cAAc,CAAC,CAAC,CAAC,iGAAiG,CAAC,CAAC,CAAC,EAAE;;;UAGvH,cAAc,CAAC,CAAC,CAAC,kNAAkN,CAAC,CAAC,CAAC,EAAE;0BACxN,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,GACtD,GAAG,OAAO,CAAC;QAEZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAC1C,MAAM,IAAI,4CAA6B,CACrC,0CAA0C,CAC3C,CAAC;QAEJ,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5C,IAAI,EAAE,WAAoB;YAC1B,SAAS,EAAE,EAAE,GAAG,EAAE,0BAA0B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,EAAE;SACrE,CAAC,CAAC,CAAC;QAEJ,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;oBACzC;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,2DA
A2D,cAAc,qBAAqB,WAAW,CAAC,MAAM,sKAAsK;oCAC7S,CAAC,CAAC,GAAG,QAAQ,qBAAqB,WAAW,CAAC,MAAM,mIAAmI;6BAC1L;4BACD,GAAG,aAAa;yBACjB;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;CACF;AA5QD,kCA4QC","sourcesContent":["/**\n * V3Evaluator mirrors Evaluator but operates on a V3 instance instead of Stagehand.\n * It uses the V3 page/screenshot APIs and constructs an LLM client to run\n * structured evaluations (YES/NO with reasoning) on screenshots and/or text.\n */\n\nimport { z } from \"zod\";\nimport type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { LLMParsedResponse } from \"./inference.js\";\nimport { LLMResponse, LLMClient } from \"./v3/llm/LLMClient.js\";\nimport { LogLine } from \"./v3/types/public/logs.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { LLMProvider } from \"./v3/llm/LLMProvider.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\n\nconst EvaluationSchema = z.object({\n evaluation: z.enum([\"YES\", \"NO\"]),\n reasoning: z.string(),\n});\n\nconst BatchEvaluationSchema = z.array(EvaluationSchema);\n\nexport class V3Evaluator {\n private v3: V3;\n private modelName: AvailableModel;\n private modelClientOptions: ClientOptions | { apiKey: string };\n private silentLogger: (message: LogLine) => void = () => {};\n\n constructor(\n v3: V3,\n modelName?: AvailableModel,\n modelClientOptions?: ClientOptions,\n ) {\n this.v3 = v3;\n this.modelName = 
modelName || (\"google/gemini-2.5-flash\" as AvailableModel);\n this.modelClientOptions = modelClientOptions || {\n apiKey:\n process.env.GEMINI_API_KEY ||\n process.env.GOOGLE_GENERATIVE_AI_API_KEY ||\n \"\",\n };\n }\n\n private getClient(): LLMClient {\n // Prefer a dedicated provider so we can override model per-evaluation\n const provider = new LLMProvider(this.v3.logger);\n return provider.getClient(this.modelName, this.modelClientOptions);\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshot = true,\n systemPrompt,\n screenshotDelayMs = 250,\n agentReasoning,\n } = options;\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!answer && !screenshot)\n throw new StagehandInvalidArgumentError(\n \"Either answer (text) or screenshot must be provided\",\n );\n\n if (Array.isArray(screenshot)) {\n return this._evaluateWithMultipleScreenshots({\n question,\n screenshots: screenshot,\n systemPrompt,\n agentReasoning,\n });\n }\n\n const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? \"a screenshot\" : \"the agents reasoning and actions throughout the task\"} that you can use to evaluate the tasks completion. 
Provide detailed reasoning for your answer.\\n Today's date is ${new Date().toLocaleDateString()}`;\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt || defaultSystemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions taken:\\n${agentReasoning}`\n : question,\n },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? 
error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n const {\n questions,\n screenshot = true,\n systemPrompt = \"You are an expert evaluator that returns YES or NO with a concise reasoning.\",\n screenshotDelayMs = 250,\n } = options;\n if (!questions?.length)\n throw new StagehandInvalidArgumentError(\n \"Questions array cannot be empty\",\n );\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const formatted = questions\n .map(\n (item, i) =>\n `${i + 1}. ${item.question}${item.answer ? `\\n Answer: ${item.answer}` : \"\"}`,\n )\n .join(\"\\n\\n\");\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n {\n role: \"system\",\n content: `${systemPrompt}\\n\\nYou will be given multiple questions${screenshot ? \" with a screenshot\" : \"\"}. ${questions.some((q) => q.answer) ? \"Some questions include answers to evaluate.\" : \"\"} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,\n },\n {\n role: \"user\",\n content: [\n { type: \"text\", text: formatted },\n ...(screenshot && imageBuffer\n ? 
[\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ],\n },\n ],\n response_model: {\n name: \"BatchEvaluationResult\",\n schema: BatchEvaluationSchema,\n },\n },\n });\n\n try {\n const results = response.data as unknown as z.infer<\n typeof BatchEvaluationSchema\n >;\n return results.map((r) => ({\n evaluation: r.evaluation,\n reasoning: r.reasoning,\n }));\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return questions.map(() => ({\n evaluation: \"INVALID\" as const,\n reasoning: `Failed to get structured response: ${errorMessage}`,\n }));\n }\n }\n\n private async _evaluateWithMultipleScreenshots(options: {\n question: string;\n screenshots: Buffer[];\n systemPrompt?: string;\n agentReasoning?: string;\n }): Promise<EvaluationResult> {\n const {\n question,\n screenshots,\n agentReasoning,\n systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.\n ${agentReasoning ? \"You also have access to the agent's detailed reasoning and thought process throughout the task.\" : \"\"}\n Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.\n Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).\n ${agentReasoning ? \"The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. 
Use this alongside the visual evidence to make a comprehensive evaluation.\" : \"\"}\n Today's date is ${new Date().toLocaleDateString()}`,\n } = options;\n\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!screenshots || screenshots.length === 0)\n throw new StagehandInvalidArgumentError(\n \"At least one screenshot must be provided\",\n );\n\n const llmClient = this.getClient();\n\n const imageContents = screenshots.map((s) => ({\n type: \"image_url\" as const,\n image_url: { url: `data:image/jpeg;base64,${s.toString(\"base64\")}` },\n }));\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions throughout the task:\\n${agentReasoning}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`\n : `${question}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,\n },\n ...imageContents,\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n}\n"]}
1
+ {"version":3,"file":"v3Evaluator.js","sourceRoot":"","sources":["../../../lib/v3Evaluator.ts"],"names":[],"mappings":";;;AAOA,iEAA+E;AAC/E,iEAA2D;AAE3D,MAAM,qBAAqB,GAAG,6BAA6B,CAAC;AAC5D,MAAM,yBAAyB,GAAuB,QAAQ,CAAC;AA2B/D,MAAa,WAAW;IACL,OAAO,CAAqB;IAC5B,eAAe,CAAoB;IAEpD,YACE,EAAM,EACN,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;QAE5B,MAAM,iBAAiB,GAAG,2BAA2B,CACnD,kBAAkB,EAClB,kBAAkB,EAClB,OAAO,CACR,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG,uBAAuB,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QAClE,IAAI,CAAC,eAAe,GAAG,IAAI,wCAAiB,CAC1C,EAAE,EACF,iBAAiB,CAAC,SAAS,EAC3B,iBAAiB,CAAC,kBAAkB,CACrC,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAC7D,CAAC;IAEO,gBAAgB,CAAC,UAAkB;QACzC,IAAI,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,eAAe,CAAC;QAC9B,CAAC;QAED,MAAM,IAAI,4CAA6B,CACrC,eAAe,UAAU,0BAA0B,qBAAqB,sHAAsH,CAC/L,CAAC;IACJ,CAAC;CACF;AAzCD,kCAyCC;AAED,SAAS,2BAA2B,CAClC,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;IAE5B,IACE,kBAAkB;QAClB,OAAO,kBAAkB,KAAK,QAAQ;QACtC,CAAC,KAAK,CAAC,OAAO,CAAC,kBAAkB,CAAC,EAClC,CAAC;QACD,OAAO;YACL,SAAS,EAAE,kBAAkB,CAAC,SAAS;YACvC,kBAAkB,EAAE,kBAAkB,CAAC,kBAAkB;YACzD,OAAO,EAAE,kBAAkB,CAAC,OAAO,IAAI,OAAO,EAAE,OAAO;SACxD,CAAC;IACJ,CAAC;IAED,OAAO;QACL,SAAS,EAAE,kBAAgD;QAC3D,kBAAkB;QAClB,OAAO,EAAE,OAAO,EAAE,OAAO;KAC1B,CAAC;AACJ,CAAC;AAED,SAAS,uBAAuB,CAC9B,eAAoC;IAEpC,MAAM,iBAAiB,GACrB,eAAe;QACf,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC;QAClC,yBAAyB,CAAC;IAC5B,MAAM,iBAAiB,GAAG,iBAAiB,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAEjE,IAAI,iBAAiB,KAAK,QAAQ,IAAI,iBAAiB,KAAK,UAAU,EAAE,CAAC;QACvE,OAAO,iBAAiB,CAAC;IAC3B,CAAC;IAED,MAAM,IAAI,4CAA6B,CACrC,WAAW,qBAAqB,KAAK,iBAAiB,qCAAqC,CAC5F,CAAC;AACJ,CAAC","sourcesContent":["import type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from 
\"./v3/types/private/evaluator.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\nimport { LegacyV3Evaluator } from \"./v3LegacyEvaluator.js\";\n\nconst EVALUATOR_BACKEND_ENV = \"STAGEHAND_EVALUATOR_BACKEND\";\nconst DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = \"legacy\";\n\nexport type V3EvaluatorBackend = \"legacy\" | \"verifier\";\n\nexport type V3EvaluatorOptions = {\n /**\n * Selects the evaluator implementation.\n *\n * \"legacy\" preserves the existing screenshot/text YES/NO evaluator.\n * \"verifier\" is reserved for the rubric verifier backend.\n *\n * @default process.env.STAGEHAND_EVALUATOR_BACKEND || \"legacy\"\n */\n backend?: V3EvaluatorBackend;\n};\n\nexport type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n};\n\ntype NormalizedConstructorOptions = {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n backend?: V3EvaluatorBackend;\n};\n\nexport class V3Evaluator {\n private readonly backend: V3EvaluatorBackend;\n private readonly legacyEvaluator: LegacyV3Evaluator;\n\n constructor(\n v3: V3,\n modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n ) {\n const normalizedOptions = normalizeConstructorOptions(\n modelNameOrOptions,\n modelClientOptions,\n options,\n );\n\n this.backend = resolveEvaluatorBackend(normalizedOptions.backend);\n this.legacyEvaluator = new LegacyV3Evaluator(\n v3,\n normalizedOptions.modelName,\n normalizedOptions.modelClientOptions,\n );\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n return this.getLegacyBackend(\"ask\").ask(options);\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n return this.getLegacyBackend(\"batchAsk\").batchAsk(options);\n }\n\n private getLegacyBackend(methodName: string): 
LegacyV3Evaluator {\n if (this.backend === \"legacy\") {\n return this.legacyEvaluator;\n }\n\n throw new StagehandInvalidArgumentError(\n `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use \"legacy\" or install the verifier backend PR.`,\n );\n }\n}\n\nfunction normalizeConstructorOptions(\n modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n): NormalizedConstructorOptions {\n if (\n modelNameOrOptions &&\n typeof modelNameOrOptions === \"object\" &&\n !Array.isArray(modelNameOrOptions)\n ) {\n return {\n modelName: modelNameOrOptions.modelName,\n modelClientOptions: modelNameOrOptions.modelClientOptions,\n backend: modelNameOrOptions.backend ?? options?.backend,\n };\n }\n\n return {\n modelName: modelNameOrOptions as AvailableModel | undefined,\n modelClientOptions,\n backend: options?.backend,\n };\n}\n\nfunction resolveEvaluatorBackend(\n explicitBackend?: V3EvaluatorBackend,\n): V3EvaluatorBackend {\n const configuredBackend =\n explicitBackend ??\n process.env[EVALUATOR_BACKEND_ENV] ??\n DEFAULT_EVALUATOR_BACKEND;\n const normalizedBackend = configuredBackend.trim().toLowerCase();\n\n if (normalizedBackend === \"legacy\" || normalizedBackend === \"verifier\") {\n return normalizedBackend;\n }\n\n throw new StagehandInvalidArgumentError(\n `Invalid ${EVALUATOR_BACKEND_ENV}=\"${configuredBackend}\". Expected \"legacy\" or \"verifier\".`,\n );\n}\n"]}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Legacy V3 evaluator implementation.
3
+ *
4
+ * This is the behavior-preserving implementation that backs V3Evaluator when
5
+ * STAGEHAND_EVALUATOR_BACKEND=legacy.
6
+ */
7
+ import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
8
+ import type { EvaluateOptions, BatchAskOptions, EvaluationResult } from "./v3/types/private/evaluator.js";
9
+ import { V3 } from "./v3/v3.js";
10
/**
 * Screenshot/text YES/NO evaluator used as the "legacy" evaluator backend.
 *
 * Public surface first, private members afterwards; member order in an
 * ambient class declaration has no effect on consumers.
 */
export declare class LegacyV3Evaluator {
    /**
     * @param v3 Stagehand instance the evaluator is bound to.
     * @param modelName Optional model used for evaluation calls.
     * @param modelClientOptions Optional client options for that model.
     */
    constructor(v3: V3, modelName?: AvailableModel, modelClientOptions?: ClientOptions);
    /**
     * Evaluates a single question.
     * @param options Question plus optional answer/screenshot settings.
     * @returns The evaluation verdict together with its reasoning.
     */
    ask(options: EvaluateOptions): Promise<EvaluationResult>;
    /**
     * Evaluates several questions in one request.
     * @param options Questions plus optional screenshot settings.
     * @returns One evaluation result per answered question.
     */
    batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]>;
    private v3;
    private modelName;
    private modelClientOptions;
    private silentLogger;
    private getClient;
    private _evaluateWithMultipleScreenshots;
}
@@ -0,0 +1,215 @@
1
+ "use strict";
2
+ /**
3
+ * Legacy V3 evaluator implementation.
4
+ *
5
+ * This is the behavior-preserving implementation that backs V3Evaluator when
6
+ * STAGEHAND_EVALUATOR_BACKEND=legacy.
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.LegacyV3Evaluator = void 0;
10
+ const zod_1 = require("zod");
11
+ const LLMProvider_js_1 = require("./v3/llm/LLMProvider.js");
12
+ const sdkErrors_js_1 = require("./v3/types/public/sdkErrors.js");
13
// Validators for the two fields of a single structured verdict.
const evaluationVerdictField = zod_1.z.enum(["YES", "NO"]);
const evaluationReasoningField = zod_1.z.string();
// One verdict: a YES/NO answer followed by the model's free-text reasoning.
// Key order is kept as `evaluation` then `reasoning`.
const EvaluationSchema = zod_1.z.object({
    evaluation: evaluationVerdictField,
    reasoning: evaluationReasoningField,
});
// Batch replies are a plain array of verdicts.
const BatchEvaluationSchema = zod_1.z.array(EvaluationSchema);
18
/**
 * Builds the fallback verdict reported when the model's reply is unusable.
 *
 * @param detail Human-readable description of what was wrong with the reply.
 * @returns An `INVALID` evaluation carrying the failure detail as reasoning.
 */
function invalidEvaluation(detail) {
    return {
        evaluation: "INVALID",
        reasoning: `Failed to get structured response: ${detail}`,
    };
}
/**
 * Validates the `data` payload of a chat-completion response against `schema`.
 *
 * The payload used to be trusted as-is, so an object without `evaluation` /
 * `reasoning` produced a result whose fields were `undefined`. Validation
 * makes every malformed reply (missing response, wrong shape, non-array
 * batch) take the same explicit failure path.
 *
 * @param response Value resolved by `createChatCompletion` (may be nullish).
 * @param schema Schema exposing `safeParse`.
 * @returns `{ ok: true, data }` on success, `{ ok: false, detail }` otherwise.
 */
function readStructuredData(response, schema) {
    const parsed = schema.safeParse(response?.data);
    if (parsed.success) {
        return { ok: true, data: parsed.data };
    }
    return { ok: false, detail: parsed.error?.message ?? String(parsed.error) };
}
/**
 * Wraps raw image bytes as an `image_url` chat-message part.
 *
 * NOTE(review): the data URL is always labelled `image/jpeg`, whatever format
 * the bytes really are - confirm against what `page.screenshot()` produces.
 *
 * @param imageBuffer Buffer holding the encoded image.
 * @returns Message part with the image inlined as a base64 data URL.
 */
function toImagePart(imageBuffer) {
    return {
        type: "image_url",
        image_url: {
            url: `data:image/jpeg;base64,${imageBuffer.toString("base64")}`,
        },
    };
}
/**
 * Waits `delayMs`, then captures the active page's viewport when requested.
 *
 * The wait happens even when no screenshot is requested, matching the
 * previous behavior of `ask` and `batchAsk`.
 *
 * @param v3 Stagehand instance whose context supplies the active page.
 * @param screenshot Truthy to capture a screenshot after the delay.
 * @param delayMs Milliseconds to wait before capturing.
 * @returns The screenshot buffer, or `undefined` when none was requested.
 */
async function captureViewportAfterDelay(v3, screenshot, delayMs) {
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    if (!screenshot) {
        return undefined;
    }
    const page = await v3.context.awaitActivePage();
    return await page.screenshot({ fullPage: false });
}
/**
 * Legacy evaluator: asks an LLM YES/NO questions about a task, optionally
 * backed by one or more screenshots and the agent's reasoning.
 */
class LegacyV3Evaluator {
    v3;
    modelName;
    modelClientOptions;
    // Passed to the LLM client so evaluation calls emit no log lines.
    silentLogger = () => { };
    /**
     * @param v3 Stagehand instance providing the logger and the active page.
     * @param modelName Model to evaluate with; defaults to
     *   "google/gemini-2.5-flash" when falsy.
     * @param modelClientOptions Client options; when falsy, an `apiKey` is
     *   read from GEMINI_API_KEY, then GOOGLE_GENERATIVE_AI_API_KEY, else "".
     */
    constructor(v3, modelName, modelClientOptions) {
        this.v3 = v3;
        this.modelName = modelName || "google/gemini-2.5-flash";
        this.modelClientOptions = modelClientOptions || {
            apiKey: process.env.GEMINI_API_KEY ||
                process.env.GOOGLE_GENERATIVE_AI_API_KEY ||
                "",
        };
    }
    /**
     * Creates an LLM client for the configured model.
     * @returns Client obtained from a freshly constructed provider.
     */
    getClient() {
        // Prefer a dedicated provider so we can override model per-evaluation
        const provider = new LLMProvider_js_1.LLMProvider(this.v3.logger);
        return provider.getClient(this.modelName, this.modelClientOptions);
    }
    /**
     * Evaluates one question against a screenshot and/or a textual answer.
     *
     * When `screenshot` is an array, the buffers are evaluated together via
     * the multi-screenshot path and no new screenshot is captured.
     *
     * @param options `question`, optional `answer`, `screenshot` (default
     *   `true`), `systemPrompt`, `screenshotDelayMs` (default 250) and
     *   `agentReasoning`.
     * @returns The verdict and reasoning, or an `INVALID` result when the
     *   model's reply does not match the expected shape.
     * @throws StagehandInvalidArgumentError when `question` is empty, or when
     *   neither an answer nor a screenshot is supplied. Errors raised by the
     *   LLM call itself are not caught here.
     */
    async ask(options) {
        const { question, answer, screenshot = true, systemPrompt, screenshotDelayMs = 250, agentReasoning, } = options;
        if (!question)
            throw new sdkErrors_js_1.StagehandInvalidArgumentError("Question cannot be an empty string");
        if (!answer && !screenshot)
            throw new sdkErrors_js_1.StagehandInvalidArgumentError("Either answer (text) or screenshot must be provided");
        if (Array.isArray(screenshot)) {
            return this._evaluateWithMultipleScreenshots({
                question,
                screenshots: screenshot,
                systemPrompt,
                agentReasoning,
            });
        }
        const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? "a screenshot" : "the agents reasoning and actions throughout the task"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\n Today's date is ${new Date().toLocaleDateString()}`;
        const imageBuffer = await captureViewportAfterDelay(this.v3, screenshot, screenshotDelayMs);
        const llmClient = this.getClient();
        const response = await llmClient.createChatCompletion({
            logger: this.silentLogger,
            options: {
                messages: [
                    { role: "system", content: systemPrompt || defaultSystemPrompt },
                    {
                        role: "user",
                        content: [
                            {
                                type: "text",
                                text: agentReasoning
                                    ? `Question: ${question}\n\nAgent's reasoning and actions taken:\n${agentReasoning}`
                                    : question,
                            },
                            // Part order is kept: question, screenshot, then answer.
                            ...(imageBuffer ? [toImagePart(imageBuffer)] : []),
                            ...(answer
                                ? [{ type: "text", text: `the answer is ${answer}` }]
                                : []),
                        ],
                    },
                ],
                response_model: { name: "EvaluationResult", schema: EvaluationSchema },
            },
        });
        const outcome = readStructuredData(response, EvaluationSchema);
        if (!outcome.ok) {
            return invalidEvaluation(outcome.detail);
        }
        return {
            evaluation: outcome.data.evaluation,
            reasoning: outcome.data.reasoning,
        };
    }
    /**
     * Evaluates several questions in a single LLM request.
     *
     * @param options `questions` (each with `question` and optional
     *   `answer`), `screenshot` (default `true`), `systemPrompt` and
     *   `screenshotDelayMs` (default 250).
     * @returns One result per entry of the model's reply; when the reply is
     *   not a valid array of verdicts, one `INVALID` result per question.
     * @throws StagehandInvalidArgumentError when `questions` is missing or
     *   empty. Errors raised by the LLM call itself are not caught here.
     */
    async batchAsk(options) {
        const { questions, screenshot = true, systemPrompt = "You are an expert evaluator that returns YES or NO with a concise reasoning.", screenshotDelayMs = 250, } = options;
        if (!questions?.length)
            throw new sdkErrors_js_1.StagehandInvalidArgumentError("Questions array cannot be empty");
        const imageBuffer = await captureViewportAfterDelay(this.v3, screenshot, screenshotDelayMs);
        const llmClient = this.getClient();
        // Numbered list of questions, each optionally followed by its answer.
        const formatted = questions
            .map((item, i) => `${i + 1}. ${item.question}${item.answer ? `\n Answer: ${item.answer}` : ""}`)
            .join("\n\n");
        const response = await llmClient.createChatCompletion({
            logger: this.silentLogger,
            options: {
                messages: [
                    {
                        role: "system",
                        content: `${systemPrompt}\n\nYou will be given multiple questions${screenshot ? " with a screenshot" : ""}. ${questions.some((q) => q.answer) ? "Some questions include answers to evaluate." : ""} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,
                    },
                    {
                        role: "user",
                        content: [
                            { type: "text", text: formatted },
                            ...(imageBuffer ? [toImagePart(imageBuffer)] : []),
                        ],
                    },
                ],
                response_model: {
                    name: "BatchEvaluationResult",
                    schema: BatchEvaluationSchema,
                },
            },
        });
        const outcome = readStructuredData(response, BatchEvaluationSchema);
        if (!outcome.ok) {
            return questions.map(() => invalidEvaluation(outcome.detail));
        }
        return outcome.data.map((entry) => ({
            evaluation: entry.evaluation,
            reasoning: entry.reasoning,
        }));
    }
    /**
     * Evaluates one question against a sequence of caller-supplied
     * screenshots, optionally alongside the agent's reasoning.
     *
     * @param options `question`, `screenshots` (non-empty array of buffers),
     *   optional `systemPrompt` and `agentReasoning`.
     * @returns The verdict and reasoning, or an `INVALID` result when the
     *   model's reply does not match the expected shape.
     * @throws StagehandInvalidArgumentError when `question` is empty or no
     *   screenshot is supplied.
     */
    async _evaluateWithMultipleScreenshots(options) {
        // The continuation lines of this template literal are part of the prompt
        // text, so they are kept flush-left exactly as they appear in the source.
        const { question, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
${agentReasoning ? "The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation." : ""}
Today's date is ${new Date().toLocaleDateString()}`, } = options;
        if (!question)
            throw new sdkErrors_js_1.StagehandInvalidArgumentError("Question cannot be an empty string");
        if (!screenshots || screenshots.length === 0)
            throw new sdkErrors_js_1.StagehandInvalidArgumentError("At least one screenshot must be provided");
        const llmClient = this.getClient();
        const imageContents = screenshots.map((s) => toImagePart(s));
        const response = await llmClient.createChatCompletion({
            logger: this.silentLogger,
            options: {
                messages: [
                    { role: "system", content: systemPrompt },
                    {
                        role: "user",
                        content: [
                            {
                                type: "text",
                                text: agentReasoning
                                    ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
                                    : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
                            },
                            ...imageContents,
                        ],
                    },
                ],
                response_model: { name: "EvaluationResult", schema: EvaluationSchema },
            },
        });
        const outcome = readStructuredData(response, EvaluationSchema);
        if (!outcome.ok) {
            return invalidEvaluation(outcome.detail);
        }
        return {
            evaluation: outcome.data.evaluation,
            reasoning: outcome.data.reasoning,
        };
    }
}
214
+ exports.LegacyV3Evaluator = LegacyV3Evaluator;
215
+ //# sourceMappingURL=v3LegacyEvaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"v3LegacyEvaluator.js","sourceRoot":"","sources":["../../../lib/v3LegacyEvaluator.ts"],"names":[],"mappings":";AAAA;;;;;GAKG;;;AAEH,6BAAwB;AAWxB,4DAAsD;AACtD,iEAA+E;AAE/E,MAAM,gBAAgB,GAAG,OAAC,CAAC,MAAM,CAAC;IAChC,UAAU,EAAE,OAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;IACjC,SAAS,EAAE,OAAC,CAAC,MAAM,EAAE;CACtB,CAAC,CAAC;AAEH,MAAM,qBAAqB,GAAG,OAAC,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC;AAExD,MAAa,iBAAiB;IACpB,EAAE,CAAK;IACP,SAAS,CAAiB;IAC1B,kBAAkB,CAAqC;IACvD,YAAY,GAA+B,GAAG,EAAE,GAAE,CAAC,CAAC;IAE5D,YACE,EAAM,EACN,SAA0B,EAC1B,kBAAkC;QAElC,IAAI,CAAC,EAAE,GAAG,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,GAAG,SAAS,IAAK,yBAA4C,CAAC;QAC5E,IAAI,CAAC,kBAAkB,GAAG,kBAAkB,IAAI;YAC9C,MAAM,EACJ,OAAO,CAAC,GAAG,CAAC,cAAc;gBAC1B,OAAO,CAAC,GAAG,CAAC,4BAA4B;gBACxC,EAAE;SACL,CAAC;IACJ,CAAC;IAEO,SAAS;QACf,sEAAsE;QACtE,MAAM,QAAQ,GAAG,IAAI,4BAAW,CAAC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC;QACjD,OAAO,QAAQ,CAAC,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,kBAAkB,CAAC,CAAC;IACrE,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,MAAM,EACJ,QAAQ,EACR,MAAM,EACN,UAAU,GAAG,IAAI,EACjB,YAAY,EACZ,iBAAiB,GAAG,GAAG,EACvB,cAAc,GACf,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,MAAM,IAAI,CAAC,UAAU;YACxB,MAAM,IAAI,4CAA6B,CACrC,qDAAqD,CACtD,CAAC;QAEJ,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,gCAAgC,CAAC;gBAC3C,QAAQ;gBACR,WAAW,EAAE,UAAU;gBACvB,YAAY;gBACZ,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,MAAM,mBAAmB,GAAG,kIAAkI,UAAU,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,sDAAsD,8HAA8H,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,CAAC;QAElZ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QA
AQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,IAAI,mBAAmB,EAAE;oBAChE;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CAAC,aAAa,QAAQ,6CAA6C,cAAc,EAAE;oCACpF,CAAC,CAAC,QAAQ;6BACb;4BACD,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;4BACP,GAAG,CAAC,MAAM;gCACR,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,iBAAiB,MAAM,EAAE,EAAE,CAAC;gCAC9D,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,MAAM,EACJ,SAAS,EACT,UAAU,GAAG,IAAI,EACjB,YAAY,GAAG,8EAA8E,EAC7F,iBAAiB,GAAG,GAAG,GACxB,GAAG,OAAO,CAAC;QACZ,IAAI,CAAC,SAAS,EAAE,MAAM;YACpB,MAAM,IAAI,4CAA6B,CACrC,iCAAiC,CAClC,CAAC;QAEJ,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,iBAAiB,CAAC,CAAC,CAAC;QAC3D,IAAI,WAA+B,CAAC;QACpC,IAAI,UAAU,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;YACrD,WAAW,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC,CAAC;QAC3D,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,SAAS,GAAG,SAAS;aACxB,GAAG,CACF,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE,CACV,GAAG,CAAC,GAAG,CAAC,KAAK,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,gBAAgB,IAAI,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAClF;aACA,IAAI,CAAC,MAAM,CAAC,CAAC;QAEhB,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,QAAQ;wBACd,OAAO,EAAE,GAAG,YA
AY,2CAA2C,UAAU,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,KAAK,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,6CAA6C,CAAC,CAAC,CAAC,EAAE,6KAA6K;qBAChX;oBACD;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,EAAE;4BACjC,GAAG,CAAC,UAAU,IAAI,WAAW;gCAC3B,CAAC,CAAC;oCACE;wCACE,IAAI,EAAE,WAAoB;wCAC1B,SAAS,EAAE;4CACT,GAAG,EAAE,0BAA0B,WAAW,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE;yCAChE;qCACF;iCACF;gCACH,CAAC,CAAC,EAAE,CAAC;yBACR;qBACF;iBACF;gBACD,cAAc,EAAE;oBACd,IAAI,EAAE,uBAAuB;oBAC7B,MAAM,EAAE,qBAAqB;iBAC9B;aACF;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,OAAO,GAAG,QAAQ,CAAC,IAExB,CAAC;YACF,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;gBACzB,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,SAAS,EAAE,CAAC,CAAC,SAAS;aACvB,CAAC,CAAC,CAAC;QACN,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO,SAAS,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC;gBAC1B,UAAU,EAAE,SAAkB;gBAC9B,SAAS,EAAE,sCAAsC,YAAY,EAAE;aAChE,CAAC,CAAC,CAAC;QACN,CAAC;IACH,CAAC;IAEO,KAAK,CAAC,gCAAgC,CAAC,OAK9C;QACC,MAAM,EACJ,QAAQ,EACR,WAAW,EACX,cAAc,EACd,YAAY,GAAG;UACX,cAAc,CAAC,CAAC,CAAC,iGAAiG,CAAC,CAAC,CAAC,EAAE;;;UAGvH,cAAc,CAAC,CAAC,CAAC,kNAAkN,CAAC,CAAC,CAAC,EAAE;0BACxN,IAAI,IAAI,EAAE,CAAC,kBAAkB,EAAE,EAAE,GACtD,GAAG,OAAO,CAAC;QAEZ,IAAI,CAAC,QAAQ;YACX,MAAM,IAAI,4CAA6B,CACrC,oCAAoC,CACrC,CAAC;QACJ,IAAI,CAAC,WAAW,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;YAC1C,MAAM,IAAI,4CAA6B,CACrC,0CAA0C,CAC3C,CAAC;QAEJ,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAEnC,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC5C,IAAI,EAAE,WAAoB;YAC1B,SAAS,EAAE,EAAE,GAAG,EAAE,0BAA0B,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,EAAE;SACrE,CAAC,CAAC,CAAC;QAEJ,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,oBAAoB,CAEnD;YACA,MAAM,EAAE,IAAI,CAAC,YAAY;YACzB,OAAO,EAAE;gBACP,QAAQ,EAAE;oBACR,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;oBACzC;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE;4BACP;gCACE,IAAI,EAAE,MAAM;gCACZ,IAAI,EAAE,cAAc;oCAClB,CAAC,CA
AC,aAAa,QAAQ,2DAA2D,cAAc,qBAAqB,WAAW,CAAC,MAAM,sKAAsK;oCAC7S,CAAC,CAAC,GAAG,QAAQ,qBAAqB,WAAW,CAAC,MAAM,mIAAmI;6BAC1L;4BACD,GAAG,aAAa;yBACjB;qBACF;iBACF;gBACD,cAAc,EAAE,EAAE,IAAI,EAAE,kBAAkB,EAAE,MAAM,EAAE,gBAAgB,EAAE;aACvE;SACF,CAAC,CAAC;QAEH,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,QAAQ,CAAC,IAEvB,CAAC;YACF,OAAO,EAAE,UAAU,EAAE,MAAM,CAAC,UAAU,EAAE,SAAS,EAAE,MAAM,CAAC,SAAS,EAAE,CAAC;QACxE,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAChB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YACzD,OAAO;gBACL,UAAU,EAAE,SAAS;gBACrB,SAAS,EAAE,sCAAsC,YAAY,EAAE;aACvD,CAAC;QACb,CAAC;IACH,CAAC;CACF;AA5QD,8CA4QC","sourcesContent":["/**\n * Legacy V3 evaluator implementation.\n *\n * This is the behavior-preserving implementation that backs V3Evaluator when\n * STAGEHAND_EVALUATOR_BACKEND=legacy.\n */\n\nimport { z } from \"zod\";\nimport type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { LLMParsedResponse } from \"./inference.js\";\nimport { LLMResponse, LLMClient } from \"./v3/llm/LLMClient.js\";\nimport { LogLine } from \"./v3/types/public/logs.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { LLMProvider } from \"./v3/llm/LLMProvider.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\n\nconst EvaluationSchema = z.object({\n evaluation: z.enum([\"YES\", \"NO\"]),\n reasoning: z.string(),\n});\n\nconst BatchEvaluationSchema = z.array(EvaluationSchema);\n\nexport class LegacyV3Evaluator {\n private v3: V3;\n private modelName: AvailableModel;\n private modelClientOptions: ClientOptions | { apiKey: string };\n private silentLogger: (message: LogLine) => void = () => {};\n\n constructor(\n v3: V3,\n modelName?: AvailableModel,\n modelClientOptions?: ClientOptions,\n ) {\n this.v3 = v3;\n this.modelName = modelName || (\"google/gemini-2.5-flash\" as AvailableModel);\n 
this.modelClientOptions = modelClientOptions || {\n apiKey:\n process.env.GEMINI_API_KEY ||\n process.env.GOOGLE_GENERATIVE_AI_API_KEY ||\n \"\",\n };\n }\n\n private getClient(): LLMClient {\n // Prefer a dedicated provider so we can override model per-evaluation\n const provider = new LLMProvider(this.v3.logger);\n return provider.getClient(this.modelName, this.modelClientOptions);\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n const {\n question,\n answer,\n screenshot = true,\n systemPrompt,\n screenshotDelayMs = 250,\n agentReasoning,\n } = options;\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!answer && !screenshot)\n throw new StagehandInvalidArgumentError(\n \"Either answer (text) or screenshot must be provided\",\n );\n\n if (Array.isArray(screenshot)) {\n return this._evaluateWithMultipleScreenshots({\n question,\n screenshots: screenshot,\n systemPrompt,\n agentReasoning,\n });\n }\n\n const defaultSystemPrompt = `You are an expert evaluator that confidently returns YES or NO based on if the original goal was achieved. You have access to ${screenshot ? \"a screenshot\" : \"the agents reasoning and actions throughout the task\"} that you can use to evaluate the tasks completion. Provide detailed reasoning for your answer.\\n Today's date is ${new Date().toLocaleDateString()}`;\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt || defaultSystemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? 
`Question: ${question}\\n\\nAgent's reasoning and actions taken:\\n${agentReasoning}`\n : question,\n },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ...(answer\n ? [{ type: \"text\" as const, text: `the answer is ${answer}` }]\n : []),\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n const {\n questions,\n screenshot = true,\n systemPrompt = \"You are an expert evaluator that returns YES or NO with a concise reasoning.\",\n screenshotDelayMs = 250,\n } = options;\n if (!questions?.length)\n throw new StagehandInvalidArgumentError(\n \"Questions array cannot be empty\",\n );\n\n await new Promise((r) => setTimeout(r, screenshotDelayMs));\n let imageBuffer: Buffer | undefined;\n if (screenshot) {\n const page = await this.v3.context.awaitActivePage();\n imageBuffer = await page.screenshot({ fullPage: false });\n }\n\n const llmClient = this.getClient();\n\n const formatted = questions\n .map(\n (item, i) =>\n `${i + 1}. ${item.question}${item.answer ? `\\n Answer: ${item.answer}` : \"\"}`,\n )\n .join(\"\\n\\n\");\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n {\n role: \"system\",\n content: `${systemPrompt}\\n\\nYou will be given multiple questions${screenshot ? \" with a screenshot\" : \"\"}. 
${questions.some((q) => q.answer) ? \"Some questions include answers to evaluate.\" : \"\"} Answer each question by returning an object in the specified JSON format. Return a single JSON array containing one object for each question in the order they were asked.`,\n },\n {\n role: \"user\",\n content: [\n { type: \"text\", text: formatted },\n ...(screenshot && imageBuffer\n ? [\n {\n type: \"image_url\" as const,\n image_url: {\n url: `data:image/jpeg;base64,${imageBuffer.toString(\"base64\")}`,\n },\n },\n ]\n : []),\n ],\n },\n ],\n response_model: {\n name: \"BatchEvaluationResult\",\n schema: BatchEvaluationSchema,\n },\n },\n });\n\n try {\n const results = response.data as unknown as z.infer<\n typeof BatchEvaluationSchema\n >;\n return results.map((r) => ({\n evaluation: r.evaluation,\n reasoning: r.reasoning,\n }));\n } catch (error) {\n const errorMessage =\n error instanceof Error ? error.message : String(error);\n return questions.map(() => ({\n evaluation: \"INVALID\" as const,\n reasoning: `Failed to get structured response: ${errorMessage}`,\n }));\n }\n }\n\n private async _evaluateWithMultipleScreenshots(options: {\n question: string;\n screenshots: Buffer[];\n systemPrompt?: string;\n agentReasoning?: string;\n }): Promise<EvaluationResult> {\n const {\n question,\n screenshots,\n agentReasoning,\n systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.\n ${agentReasoning ? \"You also have access to the agent's detailed reasoning and thought process throughout the task.\" : \"\"}\n Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.\n Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).\n ${agentReasoning ? 
\"The agent's reasoning provides crucial context about what actions were attempted, what was observed, and the decision-making process. Use this alongside the visual evidence to make a comprehensive evaluation.\" : \"\"}\n Today's date is ${new Date().toLocaleDateString()}`,\n } = options;\n\n if (!question)\n throw new StagehandInvalidArgumentError(\n \"Question cannot be an empty string\",\n );\n if (!screenshots || screenshots.length === 0)\n throw new StagehandInvalidArgumentError(\n \"At least one screenshot must be provided\",\n );\n\n const llmClient = this.getClient();\n\n const imageContents = screenshots.map((s) => ({\n type: \"image_url\" as const,\n image_url: { url: `data:image/jpeg;base64,${s.toString(\"base64\")}` },\n }));\n\n const response = await llmClient.createChatCompletion<\n LLMParsedResponse<LLMResponse>\n >({\n logger: this.silentLogger,\n options: {\n messages: [\n { role: \"system\", content: systemPrompt },\n {\n role: \"user\",\n content: [\n {\n type: \"text\",\n text: agentReasoning\n ? `Question: ${question}\\n\\nAgent's reasoning and actions throughout the task:\\n${agentReasoning}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`\n : `${question}\\n\\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,\n },\n ...imageContents,\n ],\n },\n ],\n response_model: { name: \"EvaluationResult\", schema: EvaluationSchema },\n },\n });\n\n try {\n const result = response.data as unknown as z.infer<\n typeof EvaluationSchema\n >;\n return { evaluation: result.evaluation, reasoning: result.reasoning };\n } catch (error) {\n const errorMessage =\n error instanceof Error ? 
error.message : String(error);\n return {\n evaluation: \"INVALID\",\n reasoning: `Failed to get structured response: ${errorMessage}`,\n } as const;\n }\n }\n}\n"]}
@@ -3,7 +3,7 @@ import { LLMClient } from "./v3/llm/LLMClient.js";
3
3
  import type { InferStagehandSchema, StagehandZodObject } from "./v3/zodCompat.js";
4
4
  import type { Variables } from "./v3/types/public/agent.js";
5
5
  export type { LLMParsedResponse, LLMUsage } from "./v3/llm/LLMClient.js";
6
- export declare function extract<T extends StagehandZodObject>({ instruction, domElements, schema, llmClient, logger, userProvidedInstructions, logInferenceToFile, }: {
6
+ export declare function extract<T extends StagehandZodObject>({ instruction, domElements, schema, llmClient, logger, userProvidedInstructions, logInferenceToFile, screenshot, }: {
7
7
  instruction: string;
8
8
  domElements: string;
9
9
  schema: T;
@@ -11,6 +11,7 @@ export declare function extract<T extends StagehandZodObject>({ instruction, dom
11
11
  userProvidedInstructions?: string;
12
12
  logger: (message: LogLine) => void;
13
13
  logInferenceToFile?: boolean;
14
+ screenshot?: Buffer;
14
15
  }): Promise<InferStagehandSchema<T> & {
15
16
  metadata: {
16
17
  completed: boolean;