npm - @browserbasehq/orca - Versions diffs - 3.5.0-preview.0 → 3.5.0-preview.1 - Mend

@browserbasehq/orca 3.5.0-preview.0 → 3.5.0-preview.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/dist/cjs/lib/v3/verifier/types.d.ts ADDED Viewed

@@ -0,0 +1,281 @@
+/**
+ * Shared verifier types for trajectories, rubrics, evidence, and results.
+ *
+ * The verifier consumes saved trajectories instead of a live browser. DOM and
+ * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve
+ * screenshots sent to the provider plus independent harness probes.
+ */
+/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */
+export interface TrajectoryUsage {
+    input_tokens: number;
+    output_tokens: number;
+    reasoning_tokens?: number;
+    cached_input_tokens?: number;
+    inference_time_ms?: number;
+}
+/** A single criterion in a Stagehand rubric. */
+export interface RubricCriterion {
+    /** Short name of the criterion (e.g., "Add ground beef to cart"). */
+    criterion: string;
+    /** What to evaluate and how to award partial credit. */
+    description: string;
+    /** Maximum points for this criterion. */
+    maxPoints: number;
+    /**
+     * Applicability rule for situational criteria. When this condition is not
+     * met, the criterion is excluded from scoring rather than counted as failed.
+     */
+    condition?: string;
+}
+/** A rubric — list of criteria for a task. */
+export interface Rubric {
+    items: RubricCriterion[];
+}
+/**
+ * Spec for a single task being verified. Carried both at runtime and into the
+ * verifier alongside the trajectory.
+ */
+export interface TaskSpec {
+    /** Stable identifier (e.g., "united_13" for WebTailBench, task_id for Mind2Web). */
+    id: string;
+    /** Task instruction shown to the agent. */
+    instruction: string;
+    /** Starting URL, if any. */
+    initUrl?: string;
+    /** Rubric carried by the dataset or generated by a verifier backend. */
+    precomputedRubric?: Rubric;
+    /** Optional reference answer (set when dataset ships one). */
+    expectedAnswer?: string;
+}
+/**
+ * A single modality unit in tier-1 agent evidence. Mirrors the shape of
+ * ModelMessage content parts so we can reproduce what the LLM ingested.
+ */
+export type AgentEvidenceModality = {
+    type: "text";
+    content: string;
+} | {
+    type: "image";
+    bytes: Buffer;
+    mediaType: string;
+} | {
+    type: "json";
+    content: unknown;
+};
+/**
+ * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the
+ * tool result for this step.
+ *
+ * Modes:
+ *   - CUA: usually a single image modality (the screenshot sent to the provider).
+ *   - Hybrid: tool result with optional screenshotBase64 → one image + one text.
+ *   - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.
+ */
+export interface AgentEvidence {
+    modalities: AgentEvidenceModality[];
+}
+/**
+ * Tier 2 — independent harness probes around this step.
+ *
+ * If a probe wasn't captured, the field is absent (not null).
+ */
+export interface ProbeEvidence {
+    /** URL after the step's tool execution. */
+    url?: string;
+    /**
+     * Bus screenshot captured after the step. Path on disk is preferred once
+     * persisted; in-memory Buffer is used during a live run.
+     */
+    screenshot?: Buffer;
+    /** Reference to the persisted screenshot file under the trajectory dir. */
+    screenshotPath?: string;
+    /** Accessibility tree snapshot. */
+    ariaTree?: string;
+    /** Verifier-requested probes, keyed by criterion id. */
+    onDemand?: Record<string, unknown>;
+}
+/** Outcome of a single tool execution as seen by the harness. */
+export interface ToolOutput {
+    ok: boolean;
+    /**
+     * The tool's return value. Same payload that flowed into agentEvidence
+     * modalities, but in its native shape (e.g., the extract result, the act
+     * describe-string) rather than serialized for the LLM.
+     */
+    result: unknown;
+    error?: string;
+}
+/** One step in a trajectory: action + reasoning + evidence + outcome. */
+export interface TrajectoryStep {
+    actionName: string;
+    actionArgs: Record<string, unknown>;
+    /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */
+    reasoning: string;
+    agentEvidence: AgentEvidence;
+    probeEvidence: ProbeEvidence;
+    toolOutput: ToolOutput;
+}
+/** Terminal status of the agent run. */
+export type TrajectoryStatus = "complete" | "aborted" | "stalled" | "error";
+/**
+ * Full trajectory for one task run.
+ *
+ * The on-disk layout is one directory per task:
+ *
+ *   .trajectories/<run-id>/<task-id>/
+ *     ├── task_data.json    — TaskSpec + result metadata
+ *     ├── trajectory.json   — this object, with screenshotPath instead of bytes
+ *     ├── screenshots/      — step probe/agent images plus final observation
+ *     ├── scores/
+ *     │   └── result.json       — EvaluationResult from V3Evaluator.verify()
+ *     └── core.log          — captured action log
+ */
+export interface Trajectory {
+    task: TaskSpec;
+    steps: TrajectoryStep[];
+    finalAnswer?: string;
+    /** Terminal page observation captured after the agent finishes. */
+    finalObservation?: ProbeEvidence;
+    status: TrajectoryStatus;
+    usage: TrajectoryUsage;
+}
+/** Score for a single rubric criterion after evidence analysis + rescoring. */
+export interface CriterionScore {
+    /** Matches RubricCriterion.criterion (the criterion's short name). */
+    criterion: string;
+    /** Maximum possible points for this criterion. */
+    maxPoints: number;
+    /**
+     * Points earned post-evidence-analysis (paper's post_image_earned_points).
+     * Null if the criterion was conditional and its condition wasn't met (excluded
+     * from both numerator and denominator in the process score).
+     */
+    earnedPoints: number | null;
+    /** Verifier's explanation for the score. */
+    explanation: string;
+    /**
+     * True if the criterion is conditional and its condition was determined to
+     * be met. Absent for non-conditional criteria.
+     */
+    conditionMet?: boolean;
+    /**
+     * Set when the verifier had no evidence to ground this criterion in either
+     * tier. Per paper §2, treated as uncontrollable failure → full credit, but
+     * surfaced here so dashboards can flag low-confidence results.
+     */
+    evidenceInsufficient?: boolean;
+}
+/**
+ * First-point-of-failure analysis (paper Step 9a). Identifies the earliest
+ * step where the agent's trajectory went off-track, using a structured error
+ * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).
+ */
+export interface FirstPointOfFailure {
+    stepIndex: number;
+    /** Sub-code from the error taxonomy (e.g., "2.3" for a specific hallucination type). */
+    errorCode: string;
+    /** Top-level category name (Selection, Hallucination, etc.). */
+    category: string;
+    /** Verifier's reasoning for selecting this point. */
+    description?: string;
+}
+/**
+ * Structured observation surfaced by the verifier that another agent or
+ * tooling could act on. Findings are emitted opportunistically by Step 8
+ * (outcome verification) when the verifier notices actionable patterns —
+ * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.
+ *
+ * Not produced for every task: when nothing actionable surfaces, the
+ * `findings` array on the EvaluationResult is empty or omitted. Consumers
+ * should treat the field as advisory, not as part of the formal score.
+ */
+export interface VerifierFinding {
+    /**
+     * Category of the observation. Currently a closed string-literal union;
+     * categories may be added as verifier backends surface new failure modes.
+     */
+    category: "agent_tool_usage" | "agent_strategy" | "rubric_quality" | "trajectory_capture" | "task_specification" | "verifier_uncertainty" | "other";
+    /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */
+    severity: "info" | "warning" | "blocking";
+    /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */
+    description: string;
+    /**
+     * Optional concrete next action another agent could take. Should be
+     * specific enough that it can be acted on without further reasoning —
+     * e.g., "Try double_click instead of triple_click to clear placeholder
+     * text on this form field."
+     */
+    suggestedAction?: string;
+    /** Step indices in the trajectory where this pattern showed up. */
+    relatedSteps?: number[];
+}
+/** Stable debugging summary emitted by verifier backends. */
+export interface VerifierRawSteps {
+    backend?: "legacy" | "verifier";
+    primaryIntent?: string;
+    reasoning?: string;
+    rubricSource?: "precomputed" | "generated" | "none";
+    approach?: "a" | "b";
+    optionalsMode?: "folded" | "separate" | "skip";
+    totalEarned?: number;
+    totalMax?: number;
+    evidenceImages?: number;
+    evidenceTexts?: number;
+    evidenceOriginalScreenshots?: number;
+    legacyEvaluation?: string;
+    screenshotCount?: number;
+}
+/** Task-validity classification (paper Step 10). */
+export interface TaskValidity {
+    /** True if the task is underspecified / has multiple valid interpretations. */
+    isAmbiguous: boolean;
+    /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */
+    isInvalid: boolean;
+    /** Optional sub-codes from the task-classification taxonomy. */
+    ambiguityCodes?: string[];
+    invalidTaskCodes?: string[];
+}
+/**
+ * Evaluator output. Legacy evaluation may only populate outcome fields; richer
+ * verifier backends can also populate process scoring and diagnostics.
+ *
+ * Process and outcome are deliberately independent when both are present:
+ * an agent can follow the right steps but get blocked (high process, low
+ * outcome), or succeed through an unexpected path (variable process, high
+ * outcome).
+ */
+export interface EvaluationResult {
+    /** Did the agent accomplish the task from the user's perspective? */
+    outcomeSuccess: boolean;
+    /** Human-readable explanation for the outcome. */
+    explanation?: string;
+    /** Aggregated earned/max across applicable criteria, in [0, 1]. */
+    processScore?: number;
+    /** Per-criterion breakdown after rescoring. */
+    perCriterion?: CriterionScore[];
+    /** Step 9a — first step where the trajectory went off-track, if any. */
+    firstPointOfFailure?: FirstPointOfFailure;
+    /** Step 10 — task-itself ambiguity / validity. */
+    taskValidity?: TaskValidity;
+    /**
+     * Ids (RubricCriterion.criterion strings) of criteria where neither tier of
+     * evidence resolved the question. Treated as uncontrollable → full credit,
+     * but flagged here so consumers can decide whether to discount the score.
+     */
+    evidenceInsufficient?: string[];
+    /**
+     * Structured observations from the verifier that a downstream tool or
+     * follow-up agent could act on. Opportunistic — empty or absent when the
+     * verifier doesn't notice anything actionable. Not part of the score; advisory.
+     */
+    findings?: VerifierFinding[];
+    /** Debugging summary from the active evaluator backend. */
+    rawSteps?: VerifierRawSteps;
+}
+/**
+ * Verifier interface. Implementations consume a Trajectory and return an
+ * EvaluationResult — they MUST NOT touch a live browser.
+ */
+export interface Verifier {
+    verify(trajectory: Trajectory): Promise<EvaluationResult>;
+}

package/dist/cjs/lib/v3/verifier/types.js ADDED Viewed

@@ -0,0 +1,10 @@
+"use strict";
+/**
+ * Shared verifier types for trajectories, rubrics, evidence, and results.
+ *
+ * The verifier consumes saved trajectories instead of a live browser. DOM and
+ * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve
+ * screenshots sent to the provider plus independent harness probes.
+ */
+Object.defineProperty(exports, "__esModule", { value: true });
+//# sourceMappingURL=types.js.map

package/dist/cjs/lib/v3/verifier/types.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../../../../lib/v3/verifier/types.ts"],"names":[],"mappings":";AAAA;;;;;;GAMG","sourcesContent":["/**\n * Shared verifier types for trajectories, rubrics, evidence, and results.\n *\n * The verifier consumes saved trajectories instead of a live browser. DOM and\n * Hybrid runs preserve tool-return text/JSON evidence, while CUA runs preserve\n * screenshots sent to the provider plus independent harness probes.\n */\n\n/** Token usage for one or more LLM calls. Matches AgentResult.usage shape. */\nexport interface TrajectoryUsage {\n input_tokens: number;\n output_tokens: number;\n reasoning_tokens?: number;\n cached_input_tokens?: number;\n inference_time_ms?: number;\n}\n\n/** A single criterion in a Stagehand rubric. */\nexport interface RubricCriterion {\n /** Short name of the criterion (e.g., \"Add ground beef to cart\"). */\n criterion: string;\n /** What to evaluate and how to award partial credit. */\n description: string;\n /** Maximum points for this criterion. */\n maxPoints: number;\n /**\n * Applicability rule for situational criteria. When this condition is not\n * met, the criterion is excluded from scoring rather than counted as failed.\n */\n condition?: string;\n}\n\n/** A rubric — list of criteria for a task. */\nexport interface Rubric {\n items: RubricCriterion[];\n}\n\n/**\n * Spec for a single task being verified. Carried both at runtime and into the\n * verifier alongside the trajectory.\n */\nexport interface TaskSpec {\n /** Stable identifier (e.g., \"united_13\" for WebTailBench, task_id for Mind2Web). */\n id: string;\n /** Task instruction shown to the agent. */\n instruction: string;\n /** Starting URL, if any. */\n initUrl?: string;\n /** Rubric carried by the dataset or generated by a verifier backend. */\n precomputedRubric?: Rubric;\n /** Optional reference answer (set when dataset ships one). 
*/\n expectedAnswer?: string;\n}\n\n/**\n * A single modality unit in tier-1 agent evidence. Mirrors the shape of\n * ModelMessage content parts so we can reproduce what the LLM ingested.\n */\nexport type AgentEvidenceModality =\n | { type: \"text\"; content: string }\n | { type: \"image\"; bytes: Buffer; mediaType: string }\n | { type: \"json\"; content: unknown };\n\n/**\n * Tier 1 — exactly the bytes/strings/objects the agent's LLM ingested as the\n * tool result for this step.\n *\n * Modes:\n * - CUA: usually a single image modality (the screenshot sent to the provider).\n * - Hybrid: tool result with optional screenshotBase64 → one image + one text.\n * - DOM: tool returns (extract JSON, ariaTree text, etc.) → text/json modalities.\n */\nexport interface AgentEvidence {\n modalities: AgentEvidenceModality[];\n}\n\n/**\n * Tier 2 — independent harness probes around this step.\n *\n * If a probe wasn't captured, the field is absent (not null).\n */\nexport interface ProbeEvidence {\n /** URL after the step's tool execution. */\n url?: string;\n /**\n * Bus screenshot captured after the step. Path on disk is preferred once\n * persisted; in-memory Buffer is used during a live run.\n */\n screenshot?: Buffer;\n /** Reference to the persisted screenshot file under the trajectory dir. */\n screenshotPath?: string;\n /** Accessibility tree snapshot. */\n ariaTree?: string;\n /** Verifier-requested probes, keyed by criterion id. */\n onDemand?: Record<string, unknown>;\n}\n\n/** Outcome of a single tool execution as seen by the harness. */\nexport interface ToolOutput {\n ok: boolean;\n /**\n * The tool's return value. Same payload that flowed into agentEvidence\n * modalities, but in its native shape (e.g., the extract result, the act\n * describe-string) rather than serialized for the LLM.\n */\n result: unknown;\n error?: string;\n}\n\n/** One step in a trajectory: action + reasoning + evidence + outcome. 
*/\nexport interface TrajectoryStep {\n actionName: string;\n actionArgs: Record<string, unknown>;\n /** From AgentAction.reasoning. May be empty for tools that don't surface reasoning. */\n reasoning: string;\n agentEvidence: AgentEvidence;\n probeEvidence: ProbeEvidence;\n toolOutput: ToolOutput;\n}\n\n/** Terminal status of the agent run. */\nexport type TrajectoryStatus = \"complete\" | \"aborted\" | \"stalled\" | \"error\";\n\n/**\n * Full trajectory for one task run.\n *\n * The on-disk layout is one directory per task:\n *\n * .trajectories/<run-id>/<task-id>/\n * ├── task_data.json — TaskSpec + result metadata\n * ├── trajectory.json — this object, with screenshotPath instead of bytes\n * ├── screenshots/ — step probe/agent images plus final observation\n * ├── scores/\n * │ └── result.json — Result from V3Evaluator.verify()\n * └── core.log — captured action log\n */\nexport interface Trajectory {\n task: TaskSpec;\n steps: TrajectoryStep[];\n finalAnswer?: string;\n /** Terminal page observation captured after the agent finishes. */\n finalObservation?: ProbeEvidence;\n status: TrajectoryStatus;\n usage: TrajectoryUsage;\n}\n\n/** Score for a single rubric criterion after evidence analysis + rescoring. */\nexport interface CriterionScore {\n /** Matches RubricCriterion.criterion (the criterion's short name). */\n criterion: string;\n /** Maximum possible points for this criterion. */\n maxPoints: number;\n /**\n * Points earned post-evidence-analysis (paper's post_image_earned_points).\n * Null if the criterion was conditional and its condition wasn't met (excluded\n * from both numerator and denominator in the process score).\n */\n earnedPoints: number | null;\n /** Verifier's explanation for the score. */\n explanation: string;\n /**\n * True if the criterion is conditional and its condition was determined to\n * be met. 
Absent for non-conditional criteria.\n */\n conditionMet?: boolean;\n /**\n * Set when the verifier had no evidence to ground this criterion in either\n * tier. Per paper §2, treated as uncontrollable failure → full credit, but\n * surfaced here so dashboards can flag low-confidence results.\n */\n evidenceInsufficient?: boolean;\n}\n\n/**\n * First-point-of-failure analysis (paper Step 9a). Identifies the earliest\n * step where the agent's trajectory went off-track, using a structured error\n * taxonomy (7 top-level categories, 1.1–7.4 sub-codes).\n */\nexport interface FirstPointOfFailure {\n stepIndex: number;\n /** Sub-code from the error taxonomy (e.g., \"2.3\" for a specific hallucination type). */\n errorCode: string;\n /** Top-level category name (Selection, Hallucination, etc.). */\n category: string;\n /** Verifier's reasoning for selecting this point. */\n description?: string;\n}\n\n/**\n * Structured observation surfaced by the verifier that another agent or\n * tooling could act on. Findings are emitted opportunistically by Step 8\n * (outcome verification) when the verifier notices actionable patterns —\n * repeated tool-call failures, ambiguous task specs, evidence gaps, etc.\n *\n * Not produced for every task: when nothing actionable surfaces, the\n * `findings` array on the EvaluationResult is empty. Consumers should treat the\n * field as advisory, not as part of the formal score.\n */\nexport interface VerifierFinding {\n /**\n * Category of the observation. 
Open-ended enum — additional categories may\n * be added as verifier backends surface new failure modes.\n */\n category:\n | \"agent_tool_usage\" // agent's tool calls had repeated issues (misclicks, wrong args, retries)\n | \"agent_strategy\" // higher-level planning / decision-making problems\n | \"rubric_quality\" // criteria were overly strict, ambiguous, or contradictory\n | \"trajectory_capture\" // gaps in evidence (missing screenshots, empty steps)\n | \"task_specification\" // task instruction was ambiguous / under- or over-specified\n | \"verifier_uncertainty\" // verifier itself couldn't confidently decide\n | \"other\";\n /** Impact: info (FYI), warning (worth investigating), blocking (broke the task). */\n severity: \"info\" | \"warning\" | \"blocking\";\n /** What the verifier noticed. Plain prose, grounded in evidence from the trajectory. */\n description: string;\n /**\n * Optional concrete next action another agent could take. Should be\n * specific enough that it can be acted on without further reasoning —\n * e.g., \"Try double_click instead of triple_click to clear placeholder\n * text on this form field.\"\n */\n suggestedAction?: string;\n /** Step indices in the trajectory where this pattern showed up. */\n relatedSteps?: number[];\n}\n\n/** Stable debugging summary emitted by verifier backends. */\nexport interface VerifierRawSteps {\n backend?: \"legacy\" | \"verifier\";\n primaryIntent?: string;\n reasoning?: string;\n rubricSource?: \"precomputed\" | \"generated\" | \"none\";\n approach?: \"a\" | \"b\";\n optionalsMode?: \"folded\" | \"separate\" | \"skip\";\n totalEarned?: number;\n totalMax?: number;\n evidenceImages?: number;\n evidenceTexts?: number;\n evidenceOriginalScreenshots?: number;\n legacyEvaluation?: string;\n screenshotCount?: number;\n}\n\n/** Task-validity classification (paper Step 10). */\nexport interface TaskValidity {\n /** True if the task is underspecified / has multiple valid interpretations. 
*/\n isAmbiguous: boolean;\n /** True if the task is impossible / illegal / NSFW / otherwise infeasible. */\n isInvalid: boolean;\n /** Optional sub-codes from the task-classification taxonomy. */\n ambiguityCodes?: string[];\n invalidTaskCodes?: string[];\n}\n\n/**\n * Evaluator output. Legacy evaluation may only populate outcome fields; richer\n * verifier backends can also populate process scoring and diagnostics.\n *\n * Process and outcome are deliberately independent when both are present:\n * an agent can follow the right steps but get blocked (high process, low\n * outcome), or succeed through an unexpected path (variable process, high\n * outcome).\n */\nexport interface EvaluationResult {\n /** Did the agent accomplish the task from the user's perspective? */\n outcomeSuccess: boolean;\n /** Human-readable explanation for the outcome. */\n explanation?: string;\n /** Aggregated earned/max across applicable criteria, in [0, 1]. */\n processScore?: number;\n /** Per-criterion breakdown after rescoring. */\n perCriterion?: CriterionScore[];\n /** Step 9a — first step where the trajectory went off-track, if any. */\n firstPointOfFailure?: FirstPointOfFailure;\n /** Step 10 — task-itself ambiguity / validity. */\n taskValidity?: TaskValidity;\n /**\n * Ids (RubricCriterion.criterion strings) of criteria where neither tier of\n * evidence resolved the question. Treated as uncontrollable → full credit,\n * but flagged here so consumers can decide whether to discount the score.\n */\n evidenceInsufficient?: string[];\n /**\n * Structured observations from the verifier that a downstream tool or\n * follow-up agent could act on. Opportunistic — empty when the verifier\n * doesn't notice anything actionable. Not part of the score; advisory.\n */\n findings?: VerifierFinding[];\n /** Debugging summary from the active evaluator backend. */\n rawSteps?: VerifierRawSteps;\n}\n\n/**\n * Verifier interface. 
Implementations consume a Trajectory and return an\n * EvaluationResult — they MUST NOT touch a live browser.\n */\nexport interface Verifier {\n verify(trajectory: Trajectory): Promise<EvaluationResult>;\n}\n"]}

package/dist/cjs/lib/v3Evaluator.d.ts CHANGED Viewed

@@ -1,6 +1,7 @@
 import type { AvailableModel, ClientOptions } from "./v3/types/public/model.js";
-import type { EvaluateOptions, BatchAskOptions, EvaluationResult } from "./v3/types/private/evaluator.js";
+import type { EvaluateOptions, BatchAskOptions, EvaluationResult as LegacyEvaluationResult } from "./v3/types/private/evaluator.js";
 import { V3 } from "./v3/v3.js";
+import type { Trajectory, TaskSpec, EvaluationResult, Rubric, Verifier } from "./v3/verifier/index.js";
 export type V3EvaluatorBackend = "legacy" | "verifier";
 export type V3EvaluatorOptions = {
     /**
@@ -17,11 +18,15 @@ export type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {
     modelName?: AvailableModel;
     modelClientOptions?: ClientOptions;
 };
-export declare class V3Evaluator {
+export declare class V3Evaluator implements Verifier {
     private readonly backend;
     private readonly legacyEvaluator;
     constructor(v3: V3, modelNameOrOptions?: AvailableModel | V3EvaluatorConstructorOptions, modelClientOptions?: ClientOptions, options?: V3EvaluatorOptions);
-    ask(options: EvaluateOptions): Promise<EvaluationResult>;
-    batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]>;
+    ask(options: EvaluateOptions): Promise<LegacyEvaluationResult>;
+    batchAsk(options: BatchAskOptions): Promise<LegacyEvaluationResult[]>;
+    verify(trajectory: Trajectory): Promise<EvaluationResult>;
+    generateRubric(taskSpec: TaskSpec): Promise<Rubric>;
     private getLegacyBackend;
+    private unavailableVerifierBackend;
+    private verifyTrajectoryWithLegacyEvaluator;
 }

package/dist/cjs/lib/v3Evaluator.js CHANGED Viewed

@@ -19,12 +19,48 @@ class V3Evaluator {
     async batchAsk(options) {
         return this.getLegacyBackend("batchAsk").batchAsk(options);
     }
+    async verify(trajectory) {
+        const taskSpec = assertVerifierInput(trajectory);
+        if (this.backend === "legacy") {
+            return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec);
+        }
+        return this.unavailableVerifierBackend("verify");
+    }
+    async generateRubric(taskSpec) {
+        if (!taskSpec?.id) {
+            throw new sdkErrors_js_1.StagehandInvalidArgumentError("TaskSpec.id is required for rubric generation");
+        }
+        if (this.backend === "verifier") {
+            return this.unavailableVerifierBackend("generateRubric");
+        }
+        return {
+            items: [legacyTaskCompletionCriterion(taskSpec)],
+        };
+    }
     getLegacyBackend(methodName) {
         if (this.backend === "legacy") {
             return this.legacyEvaluator;
         }
+        return this.unavailableVerifierBackend(methodName);
+    }
+    unavailableVerifierBackend(methodName) {
         throw new sdkErrors_js_1.StagehandInvalidArgumentError(`V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use "legacy" or install the verifier backend PR.`);
     }
+    async verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec) {
+        const screenshots = collectLegacyScreenshots(trajectory);
+        const agentReasoning = renderLegacyAgentReasoning(trajectory);
+        const answer = trajectory.finalAnswer;
+        if (!screenshots.length && !answer) {
+            return legacyInsufficientEvidenceResult("Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.");
+        }
+        const result = await this.legacyEvaluator.ask({
+            question: taskSpec.instruction,
+            screenshot: screenshots.length ? screenshots : false,
+            answer,
+            agentReasoning,
+        });
+        return legacyEvaluationToResult(result, screenshots.length);
+    }
 }
 exports.V3Evaluator = V3Evaluator;
 function normalizeConstructorOptions(modelNameOrOptions, modelClientOptions, options) {
@@ -53,4 +89,116 @@ function resolveEvaluatorBackend(explicitBackend) {
     }
     throw new sdkErrors_js_1.StagehandInvalidArgumentError(`Invalid ${EVALUATOR_BACKEND_ENV}="${configuredBackend}". Expected "legacy" or "verifier".`);
 }
+function assertVerifierInput(trajectory) {
+    if (!trajectory) {
+        throw new sdkErrors_js_1.StagehandInvalidArgumentError("Trajectory is required for verification");
+    }
+    if (!trajectory.task?.id) {
+        throw new sdkErrors_js_1.StagehandInvalidArgumentError("Trajectory.task.id is required for verification");
+    }
+    return trajectory.task;
+}
+function legacyTaskCompletionCriterion(taskSpec) {
+    return {
+        criterion: "legacy-task-completion",
+        description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`,
+        maxPoints: 1,
+    };
+}
+function collectLegacyScreenshots(trajectory) {
+    const screenshots = [];
+    for (const step of trajectory.steps ?? []) {
+        if (Buffer.isBuffer(step.probeEvidence?.screenshot)) {
+            screenshots.push(step.probeEvidence.screenshot);
+            continue;
+        }
+        const agentImage = step.agentEvidence?.modalities?.find((modality) => modality.type === "image" && Buffer.isBuffer(modality.bytes));
+        if (agentImage) {
+            screenshots.push(agentImage.bytes);
+        }
+    }
+    if (Buffer.isBuffer(trajectory.finalObservation?.screenshot)) {
+        screenshots.push(trajectory.finalObservation.screenshot);
+    }
+    return screenshots;
+}
+function renderLegacyAgentReasoning(trajectory) {
+    const stepLines = (trajectory.steps ?? []).map((step, i) => {
+        const status = step.toolOutput?.ok === false ? "Tool status: failed" : "";
+        const output = step.toolOutput?.error
+            ? `Tool error: ${step.toolOutput.error}`
+            : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;
+        return [
+            `Step ${i}: ${step.actionName}`,
+            step.reasoning ? `Reasoning: ${step.reasoning}` : undefined,
+            status || undefined,
+            output,
+        ]
+            .filter(Boolean)
+            .join("\n");
+    });
+    if (!stepLines.length) {
+        return undefined;
+    }
+    return truncateForPrompt(`Agent trajectory:\n${stepLines.join("\n\n")}`, 16000);
+}
+function stringifyForPrompt(value) {
+    if (typeof value === "string") {
+        return value;
+    }
+    try {
+        const serialized = JSON.stringify(value);
+        return serialized ?? String(value);
+    }
+    catch {
+        return String(value);
+    }
+}
+function truncateForPrompt(value, maxLength) {
+    if (value.length <= maxLength) {
+        return value;
+    }
+    return `${value.slice(0, maxLength)}... [truncated]`;
+}
+function legacyEvaluationToResult(result, screenshotCount) {
+    const outcomeSuccess = result.evaluation === "YES";
+    const invalid = result.evaluation === "INVALID";
+    const findings = invalid
+        ? [
+            {
+                category: "verifier_uncertainty",
+                severity: "warning",
+                description: result.reasoning,
+            },
+        ]
+        : [];
+    return {
+        outcomeSuccess,
+        explanation: result.reasoning,
+        ...(findings.length ? { findings } : {}),
+        rawSteps: {
+            backend: "legacy",
+            legacyEvaluation: result.evaluation,
+            screenshotCount,
+        },
+    };
+}
+function legacyInsufficientEvidenceResult(reason) {
+    return {
+        outcomeSuccess: false,
+        explanation: reason,
+        findings: [
+            {
+                category: "trajectory_capture",
+                severity: "blocking",
+                description: reason,
+            },
+        ],
+        rawSteps: {
+            backend: "legacy",
+            legacyEvaluation: "INVALID",
+            screenshotCount: 0,
+        },
+    };
+}
 //# sourceMappingURL=v3Evaluator.js.map

package/dist/cjs/lib/v3Evaluator.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"v3Evaluator.js","sourceRoot":"","sources":["../../../lib/v3Evaluator.ts"],"names":[],"mappings":";;;AAOA,iEAA+E;AAC/E,iEAA2D;AAE3D,MAAM,qBAAqB,GAAG,6BAA6B,CAAC;AAC5D,MAAM,yBAAyB,GAAuB,QAAQ,CAAC;AA2B/D,MAAa,WAAW;IACL,OAAO,CAAqB;IAC5B,eAAe,CAAoB;IAEpD,YACE,EAAM,EACN,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;QAE5B,MAAM,iBAAiB,GAAG,2BAA2B,CACnD,kBAAkB,EAClB,kBAAkB,EAClB,OAAO,CACR,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG,uBAAuB,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QAClE,IAAI,CAAC,eAAe,GAAG,IAAI,wCAAiB,CAC1C,EAAE,EACF,iBAAiB,CAAC,SAAS,EAC3B,iBAAiB,CAAC,kBAAkB,CACrC,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAC7D,CAAC;IAEO,gBAAgB,CAAC,UAAkB;QACzC,IAAI,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,eAAe,CAAC;QAC9B,CAAC;QAED,MAAM,IAAI,4CAA6B,CACrC,eAAe,UAAU,0BAA0B,qBAAqB,sHAAsH,CAC/L,CAAC;IACJ,CAAC;CACF;AAzCD,kCAyCC;AAED,SAAS,2BAA2B,CAClC,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;IAE5B,IACE,kBAAkB;QAClB,OAAO,kBAAkB,KAAK,QAAQ;QACtC,CAAC,KAAK,CAAC,OAAO,CAAC,kBAAkB,CAAC,EAClC,CAAC;QACD,OAAO;YACL,SAAS,EAAE,kBAAkB,CAAC,SAAS;YACvC,kBAAkB,EAAE,kBAAkB,CAAC,kBAAkB;YACzD,OAAO,EAAE,kBAAkB,CAAC,OAAO,IAAI,OAAO,EAAE,OAAO;SACxD,CAAC;IACJ,CAAC;IAED,OAAO;QACL,SAAS,EAAE,kBAAgD;QAC3D,kBAAkB;QAClB,OAAO,EAAE,OAAO,EAAE,OAAO;KAC1B,CAAC;AACJ,CAAC;AAED,SAAS,uBAAuB,CAC9B,eAAoC;IAEpC,MAAM,iBAAiB,GACrB,eAAe;QACf,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC;QAClC,yBAAyB,CAAC;IAC5B,MAAM,iBAAiB,GAAG,iBAAiB,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAEjE,IAAI,iBAAiB,KAAK,QAAQ,IAAI,iBAAiB,KAAK,UAAU,EAAE,CAAC;QACvE,OAAO,iBAAiB,CAAC;IAC3B,CAAC;IAED,MAAM,IAAI,4CAA6B,CACrC,WAAW,qBAAqB,KAAK,iBAAiB,qCAAqC,CAC5F,CAAC;AACJ,CAAC","sourcesContent":["import type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult,\n} from 
\"./v3/types/private/evaluator.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\nimport { LegacyV3Evaluator } from \"./v3LegacyEvaluator.js\";\n\nconst EVALUATOR_BACKEND_ENV = \"STAGEHAND_EVALUATOR_BACKEND\";\nconst DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = \"legacy\";\n\nexport type V3EvaluatorBackend = \"legacy\" \| \"verifier\";\n\nexport type V3EvaluatorOptions = {\n /*\n Selects the evaluator implementation.\n \n \"legacy\" preserves the existing screenshot/text YES/NO evaluator.\n * \"verifier\" is reserved for the rubric verifier backend.\n \n @default process.env.STAGEHAND_EVALUATOR_BACKEND \|\| \"legacy\"\n */\n backend?: V3EvaluatorBackend;\n};\n\nexport type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n};\n\ntype NormalizedConstructorOptions = {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n backend?: V3EvaluatorBackend;\n};\n\nexport class V3Evaluator {\n private readonly backend: V3EvaluatorBackend;\n private readonly legacyEvaluator: LegacyV3Evaluator;\n\n constructor(\n v3: V3,\n modelNameOrOptions?: AvailableModel \| V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n ) {\n const normalizedOptions = normalizeConstructorOptions(\n modelNameOrOptions,\n modelClientOptions,\n options,\n );\n\n this.backend = resolveEvaluatorBackend(normalizedOptions.backend);\n this.legacyEvaluator = new LegacyV3Evaluator(\n v3,\n normalizedOptions.modelName,\n normalizedOptions.modelClientOptions,\n );\n }\n\n async ask(options: EvaluateOptions): Promise<EvaluationResult> {\n return this.getLegacyBackend(\"ask\").ask(options);\n }\n\n async batchAsk(options: BatchAskOptions): Promise<EvaluationResult[]> {\n return this.getLegacyBackend(\"batchAsk\").batchAsk(options);\n }\n\n private getLegacyBackend(methodName: string): 
LegacyV3Evaluator {\n if (this.backend === \"legacy\") {\n return this.legacyEvaluator;\n }\n\n throw new StagehandInvalidArgumentError(\n `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. Use \"legacy\" or install the verifier backend PR.`,\n );\n }\n}\n\nfunction normalizeConstructorOptions(\n modelNameOrOptions?: AvailableModel \| V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n): NormalizedConstructorOptions {\n if (\n modelNameOrOptions &&\n typeof modelNameOrOptions === \"object\" &&\n !Array.isArray(modelNameOrOptions)\n ) {\n return {\n modelName: modelNameOrOptions.modelName,\n modelClientOptions: modelNameOrOptions.modelClientOptions,\n backend: modelNameOrOptions.backend ?? options?.backend,\n };\n }\n\n return {\n modelName: modelNameOrOptions as AvailableModel \| undefined,\n modelClientOptions,\n backend: options?.backend,\n };\n}\n\nfunction resolveEvaluatorBackend(\n explicitBackend?: V3EvaluatorBackend,\n): V3EvaluatorBackend {\n const configuredBackend =\n explicitBackend ??\n process.env[EVALUATOR_BACKEND_ENV] ??\n DEFAULT_EVALUATOR_BACKEND;\n const normalizedBackend = configuredBackend.trim().toLowerCase();\n\n if (normalizedBackend === \"legacy\" \|\| normalizedBackend === \"verifier\") {\n return normalizedBackend;\n }\n\n throw new StagehandInvalidArgumentError(\n `Invalid ${EVALUATOR_BACKEND_ENV}=\"${configuredBackend}\". Expected \"legacy\" or \"verifier\".`,\n );\n}\n"]}
1	+ {"version":3,"file":"v3Evaluator.js","sourceRoot":"","sources":["../../../lib/v3Evaluator.ts"],"names":[],"mappings":";;;AAOA,iEAA+E;AAC/E,iEAA2D;AAW3D,MAAM,qBAAqB,GAAG,6BAA6B,CAAC;AAC5D,MAAM,yBAAyB,GAAuB,QAAQ,CAAC;AA2B/D,MAAa,WAAW;IACL,OAAO,CAAqB;IAC5B,eAAe,CAAoB;IAEpD,YACE,EAAM,EACN,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;QAE5B,MAAM,iBAAiB,GAAG,2BAA2B,CACnD,kBAAkB,EAClB,kBAAkB,EAClB,OAAO,CACR,CAAC;QAEF,IAAI,CAAC,OAAO,GAAG,uBAAuB,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;QAClE,IAAI,CAAC,eAAe,GAAG,IAAI,wCAAiB,CAC1C,EAAE,EACF,iBAAiB,CAAC,SAAS,EAC3B,iBAAiB,CAAC,kBAAkB,CACrC,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,GAAG,CAAC,OAAwB;QAChC,OAAO,IAAI,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,OAAwB;QACrC,OAAO,IAAI,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;IAC7D,CAAC;IAED,KAAK,CAAC,MAAM,CAAC,UAAsB;QACjC,MAAM,QAAQ,GAAG,mBAAmB,CAAC,UAAU,CAAC,CAAC;QAEjD,IAAI,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,mCAAmC,CAAC,UAAU,EAAE,QAAQ,CAAC,CAAC;QACxE,CAAC;QAED,OAAO,IAAI,CAAC,0BAA0B,CAAC,QAAQ,CAAC,CAAC;IACnD,CAAC;IAED,KAAK,CAAC,cAAc,CAAC,QAAkB;QACrC,IAAI,CAAC,QAAQ,EAAE,EAAE,EAAE,CAAC;YAClB,MAAM,IAAI,4CAA6B,CACrC,+CAA+C,CAChD,CAAC;QACJ,CAAC;QAED,IAAI,IAAI,CAAC,OAAO,KAAK,UAAU,EAAE,CAAC;YAChC,OAAO,IAAI,CAAC,0BAA0B,CAAC,gBAAgB,CAAC,CAAC;QAC3D,CAAC;QAED,OAAO;YACL,KAAK,EAAE,CAAC,6BAA6B,CAAC,QAAQ,CAAC,CAAC;SACjD,CAAC;IACJ,CAAC;IAEO,gBAAgB,CAAC,UAAkB;QACzC,IAAI,IAAI,CAAC,OAAO,KAAK,QAAQ,EAAE,CAAC;YAC9B,OAAO,IAAI,CAAC,eAAe,CAAC;QAC9B,CAAC;QAED,OAAO,IAAI,CAAC,0BAA0B,CAAC,UAAU,CAAC,CAAC;IACrD,CAAC;IAEO,0BAA0B,CAAC,UAAkB;QACnD,MAAM,IAAI,4CAA6B,CACrC,eAAe,UAAU,0BAA0B,qBAAqB,sHAAsH,CAC/L,CAAC;IACJ,CAAC;IAEO,KAAK,CAAC,mCAAmC,CAC/C,UAAsB,EACtB,QAAkB;QAElB,MAAM,WAAW,GAAG,wBAAwB,CAAC,UAAU,CAAC,CAAC;QACzD,MAAM,cAAc,GAAG,0BAA0B,CAAC,UAAU,CAAC,CAAC;QAC9D,MAAM,MAAM,GAAG,UAAU,CAAC,WAAW,CAAC;QAEtC,IAAI,CAAC,WAAW,CAAC,MAAM,IAAI,CAAC,MAAM,EAAE,CAAC;YACnC,OAAO,gCAAgC,CACrC,qFAAqF,CACtF,CAAC;QACJ,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC;YAC5C,
QAAQ,EAAE,QAAQ,CAAC,WAAW;YAC9B,UAAU,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,KAAK;YACpD,MAAM;YACN,cAAc;SACf,CAAC,CAAC;QAEH,OAAO,wBAAwB,CAAC,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;IAC9D,CAAC;CACF;AA/FD,kCA+FC;AAED,SAAS,2BAA2B,CAClC,kBAAmE,EACnE,kBAAkC,EAClC,OAA4B;IAE5B,IACE,kBAAkB;QAClB,OAAO,kBAAkB,KAAK,QAAQ;QACtC,CAAC,KAAK,CAAC,OAAO,CAAC,kBAAkB,CAAC,EAClC,CAAC;QACD,OAAO;YACL,SAAS,EAAE,kBAAkB,CAAC,SAAS;YACvC,kBAAkB,EAAE,kBAAkB,CAAC,kBAAkB;YACzD,OAAO,EAAE,kBAAkB,CAAC,OAAO,IAAI,OAAO,EAAE,OAAO;SACxD,CAAC;IACJ,CAAC;IAED,OAAO;QACL,SAAS,EAAE,kBAAgD;QAC3D,kBAAkB;QAClB,OAAO,EAAE,OAAO,EAAE,OAAO;KAC1B,CAAC;AACJ,CAAC;AAED,SAAS,uBAAuB,CAC9B,eAAoC;IAEpC,MAAM,iBAAiB,GACrB,eAAe;QACf,OAAO,CAAC,GAAG,CAAC,qBAAqB,CAAC;QAClC,yBAAyB,CAAC;IAC5B,MAAM,iBAAiB,GAAG,iBAAiB,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAEjE,IAAI,iBAAiB,KAAK,QAAQ,IAAI,iBAAiB,KAAK,UAAU,EAAE,CAAC;QACvE,OAAO,iBAAiB,CAAC;IAC3B,CAAC;IAED,MAAM,IAAI,4CAA6B,CACrC,WAAW,qBAAqB,KAAK,iBAAiB,qCAAqC,CAC5F,CAAC;AACJ,CAAC;AAED,SAAS,mBAAmB,CAAC,UAAsB;IACjD,IAAI,CAAC,UAAU,EAAE,CAAC;QAChB,MAAM,IAAI,4CAA6B,CACrC,yCAAyC,CAC1C,CAAC;IACJ,CAAC;IACD,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC;QACzB,MAAM,IAAI,4CAA6B,CACrC,iDAAiD,CAClD,CAAC;IACJ,CAAC;IACD,OAAO,UAAU,CAAC,IAAI,CAAC;AACzB,CAAC;AAED,SAAS,6BAA6B,CAAC,QAAkB;IACvD,OAAO;QACL,SAAS,EAAE,wBAAwB;QACnC,WAAW,EAAE,yDAAyD,QAAQ,CAAC,WAAW,EAAE;QAC5F,SAAS,EAAE,CAAC;KACb,CAAC;AACJ,CAAC;AAED,SAAS,wBAAwB,CAAC,UAAsB;IACtD,MAAM,WAAW,GAAa,EAAE,CAAC;IAEjC,KAAK,MAAM,IAAI,IAAI,UAAU,CAAC,KAAK,IAAI,EAAE,EAAE,CAAC;QAC1C,IAAI,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,aAAa,EAAE,UAAU,CAAC,EAAE,CAAC;YACpD,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;YAChD,SAAS;QACX,CAAC;QAED,MAAM,UAAU,GAAG,IAAI,CAAC,aAAa,EAAE,UAAU,EAAE,IAAI,CACrD,CACE,QAAQ,EACuD,EAAE,CACjE,QAAQ,CAAC,IAAI,KAAK,OAAO,IAAI,MAAM,CAAC,QAAQ,CAAC,QAAQ,CAAC,KAAK,CAAC,CAC/D,CAAC;QAEF,IAAI,UAAU,EAAE,CAAC;YACf,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,gBAAgB,EAAE,UAAU,CAAC,EAAE,CAAC;QAC7D,
WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;IAC3D,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC;AAED,SAAS,0BAA0B,CACjC,UAAsB;IAEtB,MAAM,SAAS,GAAG,CAAC,UAAU,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACzD,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,EAAE,EAAE,KAAK,KAAK,CAAC,CAAC,CAAC,qBAAqB,CAAC,CAAC,CAAC,EAAE,CAAC;QAC1E,MAAM,MAAM,GAAG,IAAI,CAAC,UAAU,EAAE,KAAK;YACnC,CAAC,CAAC,eAAe,IAAI,CAAC,UAAU,CAAC,KAAK,EAAE;YACxC,CAAC,CAAC,gBAAgB,kBAAkB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,CAAC,EAAE,CAAC;QAClE,OAAO;YACL,QAAQ,CAAC,KAAK,IAAI,CAAC,UAAU,EAAE;YAC/B,IAAI,CAAC,SAAS,CAAC,CAAC,CAAC,cAAc,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS;YAC3D,MAAM,IAAI,SAAS;YACnB,MAAM;SACP;aACE,MAAM,CAAC,OAAO,CAAC;aACf,IAAI,CAAC,IAAI,CAAC,CAAC;IAChB,CAAC,CAAC,CAAC;IAEH,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC;QACtB,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,OAAO,iBAAiB,CACtB,sBAAsB,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,EAC9C,KAAK,CACN,CAAC;AACJ,CAAC;AAED,SAAS,kBAAkB,CAAC,KAAc;IACxC,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,IAAI,CAAC;QACH,MAAM,UAAU,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QACzC,OAAO,UAAU,IAAI,MAAM,CAAC,KAAK,CAAC,CAAC;IACrC,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC;IACvB,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,KAAa,EAAE,SAAiB;IACzD,IAAI,KAAK,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC9B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,OAAO,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,SAAS,CAAC,iBAAiB,CAAC;AACvD,CAAC;AAED,SAAS,wBAAwB,CAC/B,MAA8B,EAC9B,eAAuB;IAEvB,MAAM,cAAc,GAAG,MAAM,CAAC,UAAU,KAAK,KAAK,CAAC;IACnD,MAAM,OAAO,GAAG,MAAM,CAAC,UAAU,KAAK,SAAS,CAAC;IAChD,MAAM,QAAQ,GAAsB,OAAO;QACzC,CAAC,CAAC;YACE;gBACE,QAAQ,EAAE,sBAAsB;gBAChC,QAAQ,EAAE,SAAS;gBACnB,WAAW,EAAE,MAAM,CAAC,SAAS;aAC9B;SACF;QACH,CAAC,CAAC,EAAE,CAAC;IAEP,OAAO;QACL,cAAc;QACd,WAAW,EAAE,MAAM,CAAC,SAAS;QAC7B,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;QACxC,QAAQ,EAAE;YACR,OAAO,EAAE,QAAQ;YACjB,gBAAgB,EAAE,MAAM,CAAC,UAAU;YACnC,eAAe;SAChB;KACF,CAAC;AACJ,CAAC;AAED,SAAS,gCAAgC,CAAC,MAAc;IACtD,OAAO;QACL,cAAc
,EAAE,KAAK;QACrB,WAAW,EAAE,MAAM;QACnB,QAAQ,EAAE;YACR;gBACE,QAAQ,EAAE,oBAAoB;gBAC9B,QAAQ,EAAE,UAAU;gBACpB,WAAW,EAAE,MAAM;aACpB;SACF;QACD,QAAQ,EAAE;YACR,OAAO,EAAE,QAAQ;YACjB,gBAAgB,EAAE,SAAS;YAC3B,eAAe,EAAE,CAAC;SACnB;KACF,CAAC;AACJ,CAAC","sourcesContent":["import type { AvailableModel, ClientOptions } from \"./v3/types/public/model.js\";\nimport type {\n EvaluateOptions,\n BatchAskOptions,\n EvaluationResult as LegacyEvaluationResult,\n} from \"./v3/types/private/evaluator.js\";\nimport { V3 } from \"./v3/v3.js\";\nimport { StagehandInvalidArgumentError } from \"./v3/types/public/sdkErrors.js\";\nimport { LegacyV3Evaluator } from \"./v3LegacyEvaluator.js\";\nimport type {\n Trajectory,\n TaskSpec,\n EvaluationResult,\n Rubric,\n Verifier,\n AgentEvidenceModality,\n VerifierFinding,\n} from \"./v3/verifier/index.js\";\n\nconst EVALUATOR_BACKEND_ENV = \"STAGEHAND_EVALUATOR_BACKEND\";\nconst DEFAULT_EVALUATOR_BACKEND: V3EvaluatorBackend = \"legacy\";\n\nexport type V3EvaluatorBackend = \"legacy\" \| \"verifier\";\n\nexport type V3EvaluatorOptions = {\n /*\n Selects the evaluator implementation.\n \n \"legacy\" preserves the existing screenshot/text YES/NO evaluator.\n * \"verifier\" is reserved for the rubric verifier backend.\n \n @default process.env.STAGEHAND_EVALUATOR_BACKEND \|\| \"legacy\"\n */\n backend?: V3EvaluatorBackend;\n};\n\nexport type V3EvaluatorConstructorOptions = V3EvaluatorOptions & {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n};\n\ntype NormalizedConstructorOptions = {\n modelName?: AvailableModel;\n modelClientOptions?: ClientOptions;\n backend?: V3EvaluatorBackend;\n};\n\nexport class V3Evaluator implements Verifier {\n private readonly backend: V3EvaluatorBackend;\n private readonly legacyEvaluator: LegacyV3Evaluator;\n\n constructor(\n v3: V3,\n modelNameOrOptions?: AvailableModel \| V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n ) {\n const normalizedOptions = 
normalizeConstructorOptions(\n modelNameOrOptions,\n modelClientOptions,\n options,\n );\n\n this.backend = resolveEvaluatorBackend(normalizedOptions.backend);\n this.legacyEvaluator = new LegacyV3Evaluator(\n v3,\n normalizedOptions.modelName,\n normalizedOptions.modelClientOptions,\n );\n }\n\n async ask(options: EvaluateOptions): Promise<LegacyEvaluationResult> {\n return this.getLegacyBackend(\"ask\").ask(options);\n }\n\n async batchAsk(options: BatchAskOptions): Promise<LegacyEvaluationResult[]> {\n return this.getLegacyBackend(\"batchAsk\").batchAsk(options);\n }\n\n async verify(trajectory: Trajectory): Promise<EvaluationResult> {\n const taskSpec = assertVerifierInput(trajectory);\n\n if (this.backend === \"legacy\") {\n return this.verifyTrajectoryWithLegacyEvaluator(trajectory, taskSpec);\n }\n\n return this.unavailableVerifierBackend(\"verify\");\n }\n\n async generateRubric(taskSpec: TaskSpec): Promise<Rubric> {\n if (!taskSpec?.id) {\n throw new StagehandInvalidArgumentError(\n \"TaskSpec.id is required for rubric generation\",\n );\n }\n\n if (this.backend === \"verifier\") {\n return this.unavailableVerifierBackend(\"generateRubric\");\n }\n\n return {\n items: [legacyTaskCompletionCriterion(taskSpec)],\n };\n }\n\n private getLegacyBackend(methodName: string): LegacyV3Evaluator {\n if (this.backend === \"legacy\") {\n return this.legacyEvaluator;\n }\n\n return this.unavailableVerifierBackend(methodName);\n }\n\n private unavailableVerifierBackend(methodName: string): never {\n throw new StagehandInvalidArgumentError(\n `V3Evaluator.${methodName}() was configured with ${EVALUATOR_BACKEND_ENV}=verifier, but the verifier backend is not available in this build. 
Use \"legacy\" or install the verifier backend PR.`,\n );\n }\n\n private async verifyTrajectoryWithLegacyEvaluator(\n trajectory: Trajectory,\n taskSpec: TaskSpec,\n ): Promise<EvaluationResult> {\n const screenshots = collectLegacyScreenshots(trajectory);\n const agentReasoning = renderLegacyAgentReasoning(trajectory);\n const answer = trajectory.finalAnswer;\n\n if (!screenshots.length && !answer) {\n return legacyInsufficientEvidenceResult(\n \"Legacy evaluator compatibility mode had no screenshots or final answer to evaluate.\",\n );\n }\n\n const result = await this.legacyEvaluator.ask({\n question: taskSpec.instruction,\n screenshot: screenshots.length ? screenshots : false,\n answer,\n agentReasoning,\n });\n\n return legacyEvaluationToResult(result, screenshots.length);\n }\n}\n\nfunction normalizeConstructorOptions(\n modelNameOrOptions?: AvailableModel \| V3EvaluatorConstructorOptions,\n modelClientOptions?: ClientOptions,\n options?: V3EvaluatorOptions,\n): NormalizedConstructorOptions {\n if (\n modelNameOrOptions &&\n typeof modelNameOrOptions === \"object\" &&\n !Array.isArray(modelNameOrOptions)\n ) {\n return {\n modelName: modelNameOrOptions.modelName,\n modelClientOptions: modelNameOrOptions.modelClientOptions,\n backend: modelNameOrOptions.backend ?? options?.backend,\n };\n }\n\n return {\n modelName: modelNameOrOptions as AvailableModel \| undefined,\n modelClientOptions,\n backend: options?.backend,\n };\n}\n\nfunction resolveEvaluatorBackend(\n explicitBackend?: V3EvaluatorBackend,\n): V3EvaluatorBackend {\n const configuredBackend =\n explicitBackend ??\n process.env[EVALUATOR_BACKEND_ENV] ??\n DEFAULT_EVALUATOR_BACKEND;\n const normalizedBackend = configuredBackend.trim().toLowerCase();\n\n if (normalizedBackend === \"legacy\" \|\| normalizedBackend === \"verifier\") {\n return normalizedBackend;\n }\n\n throw new StagehandInvalidArgumentError(\n `Invalid ${EVALUATOR_BACKEND_ENV}=\"${configuredBackend}\". 
Expected \"legacy\" or \"verifier\".`,\n );\n}\n\nfunction assertVerifierInput(trajectory: Trajectory): TaskSpec {\n if (!trajectory) {\n throw new StagehandInvalidArgumentError(\n \"Trajectory is required for verification\",\n );\n }\n if (!trajectory.task?.id) {\n throw new StagehandInvalidArgumentError(\n \"Trajectory.task.id is required for verification\",\n );\n }\n return trajectory.task;\n}\n\nfunction legacyTaskCompletionCriterion(taskSpec: TaskSpec) {\n return {\n criterion: \"legacy-task-completion\",\n description: `Evaluate whether the task was completed successfully: ${taskSpec.instruction}`,\n maxPoints: 1,\n };\n}\n\nfunction collectLegacyScreenshots(trajectory: Trajectory): Buffer[] {\n const screenshots: Buffer[] = [];\n\n for (const step of trajectory.steps ?? []) {\n if (Buffer.isBuffer(step.probeEvidence?.screenshot)) {\n screenshots.push(step.probeEvidence.screenshot);\n continue;\n }\n\n const agentImage = step.agentEvidence?.modalities?.find(\n (\n modality,\n ): modality is Extract<AgentEvidenceModality, { type: \"image\" }> =>\n modality.type === \"image\" && Buffer.isBuffer(modality.bytes),\n );\n\n if (agentImage) {\n screenshots.push(agentImage.bytes);\n }\n }\n\n if (Buffer.isBuffer(trajectory.finalObservation?.screenshot)) {\n screenshots.push(trajectory.finalObservation.screenshot);\n }\n\n return screenshots;\n}\n\nfunction renderLegacyAgentReasoning(\n trajectory: Trajectory,\n): string \| undefined {\n const stepLines = (trajectory.steps ?? []).map((step, i) => {\n const status = step.toolOutput?.ok === false ? \"Tool status: failed\" : \"\";\n const output = step.toolOutput?.error\n ? `Tool error: ${step.toolOutput.error}`\n : `Tool output: ${stringifyForPrompt(step.toolOutput?.result)}`;\n return [\n `Step ${i}: ${step.actionName}`,\n step.reasoning ? 
`Reasoning: ${step.reasoning}` : undefined,\n status \|\| undefined,\n output,\n ]\n .filter(Boolean)\n .join(\"\\n\");\n });\n\n if (!stepLines.length) {\n return undefined;\n }\n\n return truncateForPrompt(\n `Agent trajectory:\\n${stepLines.join(\"\\n\\n\")}`,\n 16000,\n );\n}\n\nfunction stringifyForPrompt(value: unknown): string {\n if (typeof value === \"string\") {\n return value;\n }\n\n try {\n const serialized = JSON.stringify(value);\n return serialized ?? String(value);\n } catch {\n return String(value);\n }\n}\n\nfunction truncateForPrompt(value: string, maxLength: number): string {\n if (value.length <= maxLength) {\n return value;\n }\n\n return `${value.slice(0, maxLength)}... [truncated]`;\n}\n\nfunction legacyEvaluationToResult(\n result: LegacyEvaluationResult,\n screenshotCount: number,\n): EvaluationResult {\n const outcomeSuccess = result.evaluation === \"YES\";\n const invalid = result.evaluation === \"INVALID\";\n const findings: VerifierFinding[] = invalid\n ? [\n {\n category: \"verifier_uncertainty\",\n severity: \"warning\",\n description: result.reasoning,\n },\n ]\n : [];\n\n return {\n outcomeSuccess,\n explanation: result.reasoning,\n ...(findings.length ? { findings } : {}),\n rawSteps: {\n backend: \"legacy\",\n legacyEvaluation: result.evaluation,\n screenshotCount,\n },\n };\n}\n\nfunction legacyInsufficientEvidenceResult(reason: string): EvaluationResult {\n return {\n outcomeSuccess: false,\n explanation: reason,\n findings: [\n {\n category: \"trajectory_capture\",\n severity: \"blocking\",\n description: reason,\n },\n ],\n rawSteps: {\n backend: \"legacy\",\n legacyEvaluation: \"INVALID\",\n screenshotCount: 0,\n },\n };\n}\n"]}

package/dist/cjs/lib/v3LegacyEvaluator.js CHANGED Viewed

@@ -43,6 +43,7 @@ class LegacyV3Evaluator {
         if (Array.isArray(screenshot)) {
             return this._evaluateWithMultipleScreenshots({
                 question,
+                answer,
                 screenshots: screenshot,
                 systemPrompt,
                 agentReasoning,
@@ -162,7 +163,7 @@ class LegacyV3Evaluator {
         }
     }
     async _evaluateWithMultipleScreenshots(options) {
-        const { question, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
+        const { question, answer, screenshots, agentReasoning, systemPrompt = `You are an expert evaluator that confidently returns YES or NO given a question and multiple screenshots showing the progression of a task.
         ${agentReasoning ? "You also have access to the agent's detailed reasoning and thought process throughout the task." : ""}
         Analyze ALL screenshots to understand the complete journey. Look for evidence of task completion across all screenshots, not just the last one.
         Success criteria may appear at different points in the sequence (confirmation messages, intermediate states, etc).
@@ -191,6 +192,9 @@ class LegacyV3Evaluator {
                                     ? `Question: ${question}\n\nAgent's reasoning and actions throughout the task:\n${agentReasoning}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze both the agent's reasoning and all screenshots to determine if the task was completed successfully.`
                                     : `${question}\n\nI'm providing ${screenshots.length} screenshots showing the progression of the task. Please analyze all of them to determine if the task was completed successfully.`,
                             },
+                            ...(answer
+                                ? [{ type: "text", text: `the answer is ${answer}` }]
+                                : []),
                             ...imageContents,
                         ],
                     },