npm - @sebastiantuyu/agest - Versions diffs - 0.1.6 → 0.2.2 - Mend

@sebastiantuyu/agest 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/README.md +24 -6
package/dist/adapters/index.d.ts +2 -0
package/dist/adapters/index.js +1 -0
package/dist/adapters/remote.d.ts +64 -0
package/dist/adapters/remote.js +133 -0
package/dist/assertions.d.ts +7 -0
package/dist/assertions.js +9 -0
package/dist/config.d.ts +19 -0
package/dist/config.js +19 -0
package/dist/context.d.ts +7 -1
package/dist/context.js +60 -18
package/dist/index.d.ts +10 -2
package/dist/index.js +27 -4
package/dist/judge.d.ts +9 -0
package/dist/judge.js +101 -0
package/dist/preview.d.ts +1 -0
package/dist/preview.js +777 -0
package/dist/reporter.d.ts +2 -1
package/dist/reporter.js +49 -14
package/dist/reports.d.ts +78 -0
package/dist/reports.js +278 -0
package/dist/runner.d.ts +2 -1
package/dist/runner.js +49 -4
package/dist/stats.js +222 -65
package/dist/types.d.ts +12 -0
package/package.json +12 -3

package/README.md CHANGED Viewed

@@ -1,5 +1,7 @@
 # Agest
+[![Build Status](https://github.com/sebastiantuyu/agest/actions/workflows/publish.yml/badge.svg)](https://github.com/sebastiantuyu/agest/actions/workflows/publish.yml)
 A quantitative testing library for agents using a Jest-like syntax.
 Batteries included.
@@ -108,12 +110,28 @@ npx tsx examples/openrouter.test.ts
 ## Roadmap
-- [ ] Multi-run support: `.runs(n)` per scene for statistical significance
-- [ ] Suite-level runs: `agent(exec, { runs: 3 }, () => { ... })` for overall stability benchmarks
-- [ ] Additional matchers: `toBe.semanticallySimilarTo(text, threshold)`, `toBe.matchingSchema(zodSchema)`
-- [ ] JSON/file reporters for persisting reports to disk
-- [ ] Snapshot comparison: diff reports across runs to track agent regression
-- [ ] More adapters: Vercel AI SDK, OpenAI Agents SDK, raw API calls
+### Shipped
+- [x] Multi-turn support: `.turns(n)` per scene
+- [x] LLM-as-judge: `.judgedBy({ criteria, failWhen })`
+- [x] Remote HTTP adapter for framework-agnostic testing
+- [x] Report persistence to `.reports/` with YAML format
+- [x] Stats CLI with multi-model comparison and dimension analysis
+### Up next
+- [ ] Schema validation: `toBe.matchingSchema(zodSchema)`
+- [ ] Semantic similarity: `toBe.semanticallySimilarTo(text, threshold)`
+- [ ] Statistical runs: `.runs(n)` per scene with mean/stddev reporting
+- [ ] Vercel AI SDK adapter
+- [ ] Snapshot regression: diff current run against a saved baseline
+### Planned
+- [ ] Cost estimation per scene (token count to dollar cost)
+- [ ] CI/CD reporter (GitHub Actions PR comments)
+- [ ] Tool-call trajectory assertions
+- [ ] Watch mode for TDD-style iteration
+- [ ] OpenAI Agents SDK adapter
+- [ ] Webhook/n8n adapter for no-code agent sources
+- [ ] Jest/Vitest custom matcher export
 ## Development requirements
 - Node 22+

package/dist/adapters/index.d.ts CHANGED Viewed

@@ -1 +1,3 @@
 export { langchain } from "./langchain";
+export { remote } from "./remote";
+export type { RemoteAdapterOptions } from "./remote";

package/dist/adapters/index.js CHANGED Viewed

@@ -1 +1,2 @@
 export { langchain } from "./langchain";
+export { remote } from "./remote";

package/dist/adapters/remote.d.ts ADDED Viewed

@@ -0,0 +1,64 @@
+import type { AgentExecutor, AgentResponse } from "../types";
+export interface RemoteAdapterOptions {
+    /**
+     * Extra HTTP headers (e.g. Authorization).
+     * Spread after the default `Content-Type: application/json`, so a
+     * `Content-Type` given here replaces the default.
+     */
+    headers?: Record<string, string>;
+    /**
+     * HTTP method, defaults to POST.
+     * With GET no request body is built, so the input prompt is not sent
+     * by the adapter at all.
+     */
+    method?: "POST" | "PUT" | "GET";
+    /**
+     * Extra fields merged into the request body.
+     * Merged *under* the output of `buildRequest`, so `buildRequest` wins on conflicts.
+     * When `buildRequest` returns a non-object value (e.g. a string), that value
+     * is sent as the `prompt` field next to these extra fields.
+     * Ignored when method is GET.
+     */
+    body?: Record<string, unknown>;
+    /**
+     * Build the request body from the input prompt.
+     * Defaults to `{ prompt: input }`.
+     * The result is JSON-serialized; not called when method is GET.
+     */
+    buildRequest?: (input: string) => unknown;
+    /**
+     * Parse the raw response body into an AgentResponse.
+     * The body is the parsed JSON when the response content type contains
+     * `application/json`, otherwise the response text.
+     * When omitted the adapter tries common shapes, in this order:
+     * - a plain string body
+     * - a top-level string field: `text` / `response` / `output` / `message` / `content` / `answer`
+     * - the same fields nested under `data` or `result`
+     * - otherwise the body serialized with `JSON.stringify`
+     */
+    parseResponse?: (body: unknown) => AgentResponse;
+    /**
+     * Static metadata for this remote agent.
+     * Because the remote endpoint is opaque, metadata like model name,
+     * tools, and system prompt must be provided manually here.
+     * Metadata found in the response (or returned by `parseResponse`) is
+     * merged on top of these values and wins on conflicts.
+     */
+    metadata?: {
+        model?: string;
+        tokens?: {
+            input: number;
+            output: number;
+        };
+        tools?: string[];
+        systemPrompt?: string;
+        [key: string]: unknown;
+    };
+}
+/**
+ * Adapter for remote agents exposed via HTTP endpoints.
+ *
+ * Since the remote agent is a black box, metadata (model, tools, etc.)
+ * must be supplied manually through `options.metadata`. If the endpoint
+ * returns token usage or other metadata, provide a `parseResponse`
+ * function to extract it.
+ *
+ * A failed request or a non-2xx status does not throw: the executor
+ * resolves with an empty `text` and the reason in `executionError`.
+ *
+ * @example
+ * ```ts
+ * import { remote } from "agest/adapters";
+ *
+ * const executor = remote("https://my-agent.example.com/chat", {
+ *   headers: { Authorization: "Bearer sk-..." },
+ *   metadata: { model: "gpt-4o", tools: ["search", "calculator"] },
+ * });
+ *
+ * await agent(executor, () => {
+ *   scene("What is 2+2?").expect("response", (r) => {
+ *     expect(r).toBe.containing("4");
+ *   });
+ * });
+ * ```
+ */
+export declare function remote(endpoint: string, options?: RemoteAdapterOptions): AgentExecutor;

package/dist/adapters/remote.js ADDED Viewed

@@ -0,0 +1,133 @@
+/**
+ * Adapter for remote agents exposed via HTTP endpoints.
+ *
+ * Since the remote agent is a black box, metadata (model, tools, etc.)
+ * must be supplied manually through `options.metadata`. If the endpoint
+ * returns token usage or other metadata, provide a `parseResponse`
+ * function to extract it.
+ *
+ * Transport problems never reject the returned executor. A failed request,
+ * a non-2xx status, or an unreadable/malformed response body all resolve to
+ * `{ text: "", executionError, metadata }` so the scene is reported as a
+ * failure instead of aborting the run. Exceptions thrown by a user-supplied
+ * `parseResponse` are not caught here.
+ *
+ * @param endpoint URL the request is sent to.
+ * @param options  Request/response customisation, see `RemoteAdapterOptions`.
+ * @returns An executor taking the prompt and resolving to an agent response.
+ *
+ * @example
+ * ```ts
+ * import { remote } from "agest/adapters";
+ *
+ * const executor = remote("https://my-agent.example.com/chat", {
+ *   headers: { Authorization: "Bearer sk-..." },
+ *   metadata: { model: "gpt-4o", tools: ["search", "calculator"] },
+ * });
+ *
+ * await agent(executor, () => {
+ *   scene("What is 2+2?").expect("response", (r) => {
+ *     expect(r).toBe.containing("4");
+ *   });
+ * });
+ * ```
+ */
+export function remote(endpoint, options = {}) {
+    const { headers = {}, method = "POST", body: extraBody, buildRequest = defaultBuildRequest, parseResponse, metadata: staticMetadata, } = options;
+    // Every transport-level failure is reported with the same response shape.
+    const failure = (executionError) => ({
+        text: "",
+        executionError,
+        metadata: staticMetadata,
+    });
+    // Anything can be thrown in JavaScript; only Error instances carry `.message`.
+    const describe = (err) => (err instanceof Error ? err.message : String(err));
+    return async (input) => {
+        let res;
+        try {
+            const fetchOptions = {
+                method,
+                headers: { "Content-Type": "application/json", ...headers },
+            };
+            if (method !== "GET") {
+                const built = buildRequest(input);
+                // Object payloads win over `extraBody` on key conflicts; a
+                // non-object payload is sent as the `prompt` field instead.
+                const merged = extraBody && typeof built === "object" && built !== null
+                    ? { ...extraBody, ...built }
+                    : extraBody && typeof built !== "object"
+                        ? { ...extraBody, prompt: built }
+                        : built;
+                fetchOptions.body = JSON.stringify(merged);
+            }
+            res = await fetch(endpoint, fetchOptions);
+        }
+        catch (err) {
+            return failure(`Request failed: ${describe(err)}`);
+        }
+        if (!res.ok) {
+            return failure(`HTTP ${res.status}: ${res.statusText}`);
+        }
+        let body;
+        try {
+            const contentType = res.headers.get("content-type") ?? "";
+            // A JSON content type with a malformed payload makes res.json() throw;
+            // report it like any other transport failure rather than rejecting.
+            body = contentType.includes("application/json")
+                ? await res.json()
+                : await res.text();
+        }
+        catch (err) {
+            return failure(`Invalid response body: ${describe(err)}`);
+        }
+        if (parseResponse) {
+            const parsed = parseResponse(body);
+            return {
+                ...parsed,
+                // Metadata extracted by the caller overrides the static values.
+                metadata: { ...staticMetadata, ...parsed.metadata },
+            };
+        }
+        const text = extractText(body);
+        return {
+            text,
+            metadata: {
+                ...staticMetadata,
+                ...extractResponseMetadata(body),
+            },
+        };
+    };
+}
+/** Default request builder: wraps the raw prompt as `{ prompt: <input> }`. */
+function defaultBuildRequest(input) {
+    const payload = { prompt: input };
+    return payload;
+}
+/**
+ * Best-effort extraction of the agent's reply text from a response body.
+ *
+ * Lookup order: the body itself when it is a string; the first string-valued
+ * field among `text`, `response`, `output`, `message`, `content`, `answer`;
+ * the same fields inside a `data` wrapper, then inside a `result` wrapper.
+ * Non-object bodies are converted with `String()`, and objects with no
+ * matching field are returned as their JSON serialization.
+ */
+function extractText(body) {
+    if (typeof body === "string")
+        return body;
+    const isRecord = (value) => typeof value === "object" && value !== null;
+    if (!isRecord(body))
+        return String(body);
+    const textKeys = ["text", "response", "output", "message", "content", "answer"];
+    // First string-valued candidate field of `record`, or undefined.
+    const firstString = (record) => {
+        const hit = textKeys.find((key) => typeof record[key] === "string");
+        return hit === undefined ? undefined : record[hit];
+    };
+    const direct = firstString(body);
+    if (direct !== undefined)
+        return direct;
+    // Try nested: { data: { text } }, { result: { output } }
+    for (const wrapper of ["data", "result"]) {
+        const inner = body[wrapper];
+        if (!isRecord(inner))
+            continue;
+        const nested = firstString(inner);
+        if (nested !== undefined)
+            return nested;
+    }
+    return JSON.stringify(body);
+}
+/**
+ * Pull well-known metadata out of a JSON response body.
+ *
+ * Recognised fields:
+ * - `model` (string)
+ * - token usage, looked up in `usage`, `token_usage`, `tokens`,
+ *   `metadata.usage`, `metadata.tokens` (first one present wins), reading
+ *   `input_tokens` / `prompt_tokens` / `promptTokens` / `input` and
+ *   `output_tokens` / `completion_tokens` / `completionTokens` / `output`
+ * - `refusal` (boolean)
+ *
+ * Only finite numbers are accepted as token counts, so a string or otherwise
+ * malformed value never ends up in `tokens`. When just one side is available
+ * the other is reported as 0.
+ *
+ * @returns The collected metadata, or `undefined` when the body is not an
+ *          object or none of the fields were found.
+ */
+function extractResponseMetadata(body) {
+    if (typeof body !== "object" || body === null)
+        return undefined;
+    const obj = body;
+    const meta = {};
+    if (typeof obj.model === "string")
+        meta.model = obj.model;
+    // Try to find token usage in common locations
+    const nestedMeta = typeof obj.metadata === "object" && obj.metadata !== null
+        ? obj.metadata
+        : undefined;
+    const usage = obj.usage ?? obj.token_usage ?? obj.tokens ??
+        nestedMeta?.usage ?? nestedMeta?.tokens;
+    if (typeof usage === "object" && usage !== null) {
+        // First candidate key holding a usable numeric count.
+        const firstCount = (keys) => {
+            for (const key of keys) {
+                const value = usage[key];
+                if (typeof value === "number" && Number.isFinite(value))
+                    return value;
+            }
+            return undefined;
+        };
+        const input = firstCount(["input_tokens", "prompt_tokens", "promptTokens", "input"]);
+        const output = firstCount(["output_tokens", "completion_tokens", "completionTokens", "output"]);
+        if (input !== undefined || output !== undefined) {
+            meta.tokens = { input: input ?? 0, output: output ?? 0 };
+        }
+    }
+    if (typeof obj.refusal === "boolean")
+        meta.refusal = obj.refusal;
+    return Object.keys(meta).length > 0 ? meta : undefined;
+}

package/dist/assertions.d.ts CHANGED Viewed

@@ -1,8 +1,15 @@
+import type { JudgeCriteria } from "./judge";
+export interface PendingJudgement {
+    value: unknown;
+    criteria: JudgeCriteria;
+}
+export declare function collectPendingJudgements(): PendingJudgement[];
 export interface AgentMatchers {
     refusal(): void;
     notRefusal(): void;
     containing(text: string): void;
     matchingPattern(regex: RegExp): void;
+    judgedBy(criteria: JudgeCriteria): void;
 }
 export interface AgentExpectation {
     readonly toBe: AgentMatchers;

package/dist/assertions.js CHANGED Viewed

@@ -1,4 +1,10 @@
 import { isRefusal } from "./refusal";
+/**
+ * Judgements queued by `judgedBy()` that have not been collected yet.
+ * NOTE(review): this is module-level state shared by every scene; confirm the
+ * runner drains it per scene when scenes run with parallelism > 1.
+ */
+let pendingJudgements = [];
+/**
+ * Drain the queue: hand back every judgement queued so far and leave a
+ * fresh, empty queue behind for subsequent `judgedBy()` calls.
+ */
+export function collectPendingJudgements() {
+    const drained = pendingJudgements;
+    pendingJudgements = [];
+    return drained;
+}
 export function expect(value) {
     return {
         get toBe() {
@@ -31,6 +37,9 @@ export function expect(value) {
                         throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
                     }
                 },
+                judgedBy(criteria) {
+                    pendingJudgements.push({ value, criteria });
+                },
             };
         },
     };

package/dist/config.d.ts ADDED Viewed

@@ -0,0 +1,19 @@
+/**
+ * Function that sends the fully built judge prompt to a model and resolves
+ * with the model's raw text reply. The reply is expected to contain a JSON
+ * object with a `verdict` of "pass", "fail" or "partial".
+ */
+export type JudgeExecutor = (prompt: string) => Promise<string>;
+/** Settings for the LLM used by `judgedBy()` assertions. */
+export interface JudgeConfig {
+    /** Model identifier passed to the OpenAI-compatible API. Defaults to "openai/gpt-oss-20b". */
+    model?: string;
+    /** API key. Defaults to OPENROUTER_API_KEY then OPENAI_API_KEY env vars. */
+    apiKey?: string;
+    /** Base URL for the chat completions endpoint. Defaults to "https://openrouter.ai/api/v1". */
+    baseUrl?: string;
+    /** Fully custom judge function. When provided, model/apiKey/baseUrl are ignored. */
+    executor?: JudgeExecutor;
+}
+/** Project-level settings read from `agest.config.ts` / `agest.config.js`. */
+export interface AgestConfig {
+    /** Maximum number of scenes executed concurrently. Defaults to 1; values below 1 are treated as 1. */
+    parallelism?: number;
+    /**
+     * Suite-wide timeout forwarded to every scene run.
+     * Presumably milliseconds, like `SceneBuilder.timeout(ms)`; how it combines
+     * with a per-scene timeout is decided by the runner (TODO confirm).
+     */
+    timeout?: number;
+    /** Suite-wide turn count forwarded to every scene run alongside any per-scene `.turns(n)`. */
+    turns?: number;
+    /** Judge settings forwarded to every scene run. */
+    judge?: JudgeConfig;
+}
+/** Identity helper for authoring a config file; returns `config` unchanged. */
+export declare function defineConfig(config: AgestConfig): AgestConfig;
+/**
+ * Load the config from the current working directory, trying
+ * `agest.config.ts` and then `agest.config.js`. Resolves with the module's
+ * default export (or the module itself), or `{}` when neither file loads.
+ */
+export declare function loadConfig(): Promise<AgestConfig>;

package/dist/config.js ADDED Viewed

@@ -0,0 +1,19 @@
+import { existsSync } from "fs";
+import path from "path";
+import { pathToFileURL } from "url";
+/**
+ * Identity helper for authoring a config file: returns `config` unchanged.
+ */
+export function defineConfig(config) {
+    return config;
+}
+/**
+ * Load the project configuration from the current working directory.
+ *
+ * Candidates are tried in order: `agest.config.ts`, then `agest.config.js`.
+ * The first one that imports successfully wins; its default export is
+ * returned, or the module namespace when there is no default export.
+ *
+ * Loading stays best-effort: a candidate that exists but fails to import
+ * (syntax error, missing TypeScript loader, ...) is reported with a warning
+ * and the next candidate is tried, so a broken config is no longer silently
+ * ignored.
+ *
+ * @returns The loaded config, or `{}` when no candidate could be loaded.
+ */
+export async function loadConfig() {
+    const candidates = ["agest.config.ts", "agest.config.js"]
+        .map((file) => path.join(process.cwd(), file));
+    for (const candidate of candidates) {
+        // Absent files are the normal case; skip them without any noise so the
+        // catch below only ever sees genuine load failures.
+        if (!existsSync(candidate))
+            continue;
+        try {
+            // import() needs a file: URL for absolute paths; a bare Windows
+            // path such as C:\project\agest.config.js is rejected by Node.
+            const mod = await import(pathToFileURL(candidate).href);
+            return (mod.default ?? mod);
+        }
+        catch (err) {
+            const reason = err instanceof Error ? err.message : String(err);
+            console.warn(`[agest] Failed to load ${candidate}: ${reason}`);
+        }
+    }
+    return {};
+}

package/dist/context.d.ts CHANGED Viewed

@@ -2,16 +2,22 @@ import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
 export declare class SceneBuilder {
     private _prompt;
     private _assertions;
+    private _timeout?;
+    private _turns?;
     constructor(_prompt: string);
+    timeout(ms: number): SceneBuilder;
+    turns(n: number): SceneBuilder;
     expect(field: string, fn: (value: any) => void): SceneBuilder;
     toDefinition(): SceneDefinition;
 }
 export declare class AgentContext {
     private _executor;
+    private _name?;
     private _scenes;
-    constructor(_executor: AgentExecutor);
+    constructor(_executor: AgentExecutor, _name?: string | undefined);
     registerScene(prompt: string): SceneBuilder;
     execute(): Promise<AgentReport>;
 }
+export declare function hashPromptOnly(prompt: string): string;
 export declare function setContext(ctx: AgentContext | null): void;
 export declare function getContext(): AgentContext;

package/dist/context.js CHANGED Viewed

@@ -1,26 +1,40 @@
 import { createHash } from "crypto";
 import { executeScene } from "./runner";
-import { formatReport, writeReport } from "./reporter";
+import { formatReport, writeReport, writeDiffEntry } from "./reporter";
 import { logger, c } from "./logger";
+import { loadConfig } from "./config";
+import { PromisePool } from "@supercharge/promise-pool";
 export class SceneBuilder {
     _prompt;
     _assertions = [];
+    _timeout;
+    _turns;
     constructor(_prompt) {
         this._prompt = _prompt;
     }
+    timeout(ms) {
+        this._timeout = ms;
+        return this;
+    }
+    turns(n) {
+        this._turns = n;
+        return this;
+    }
     expect(field, fn) {
         this._assertions.push({ field, fn });
         return this;
     }
     toDefinition() {
-        return { prompt: this._prompt, assertions: [...this._assertions] };
+        return { prompt: this._prompt, assertions: [...this._assertions], timeout: this._timeout, turns: this._turns };
     }
 }
 export class AgentContext {
     _executor;
+    _name;
     _scenes = [];
-    constructor(_executor) {
+    constructor(_executor, _name) {
         this._executor = _executor;
+        this._name = _name;
     }
     registerScene(prompt) {
         const builder = new SceneBuilder(prompt);
@@ -28,32 +42,39 @@ export class AgentContext {
         return builder;
     }
     async execute() {
+        const config = await loadConfig();
+        const parallelism = Math.max(1, config.parallelism ?? 1);
         const definitions = this._scenes.map((s) => s.toDefinition());
-        const results = [];
-        let totalDuration = 0;
+        const orderedResults = new Array(definitions.length);
         const total = definitions.length;
-        logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}...\n`));
-        for (let i = 0; i < definitions.length; i++) {
-            const scene = definitions[i];
+        logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
+        const tasks = definitions.map((scene, i) => async () => {
             const label = scene.prompt.length > 60
                 ? scene.prompt.slice(0, 57) + "..."
                 : scene.prompt;
-            logger.write(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... `);
-            const result = await executeScene(this._executor, scene);
-            results.push(result);
-            totalDuration += result.duration;
+            const result = await executeScene(this._executor, scene, config.timeout, config.judge, config.turns);
+            orderedResults[i] = result;
             const ms = result.duration.toFixed(0);
             if (result.passed) {
-                logger.info(c.green(`PASS`) + c.dim(` (${ms}ms)`));
+                logger.info(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}`);
+            }
+            else if (result.judgement?.verdict === "partial") {
+                logger.info(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}`);
+                if (result.error) {
+                    logger.info(`         ${c.yellow(result.error)}`);
+                }
             }
             else {
-                logger.info(c.red(`FAIL`) + c.dim(` (${ms}ms)`));
+                logger.info(`  ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}`);
                 if (result.error) {
                     logger.info(`         ${c.red(result.error)}`);
                 }
             }
             logger.debug(`         response: ${result.response.text?.slice(0, 120)}`);
-        }
+        });
+        await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
+        const results = orderedResults;
+        let totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
         logger.info("");
         const failedResults = results.filter((r) => !r.passed);
         const failedCases = failedResults.map((r) => r.prompt);
@@ -75,11 +96,25 @@ export class AgentContext {
         }
         const firstMeta = results.find((r) => r.response.metadata)?.response
             .metadata;
+        const dimensions = {};
+        if (firstMeta?.model)
+            dimensions.model = firstMeta.model;
+        if (firstMeta?.systemPrompt)
+            dimensions.prompt = hashPromptOnly(firstMeta.systemPrompt);
+        if (firstMeta?.tools?.length)
+            dimensions.tools = [...firstMeta.tools].sort().join(",");
+        else
+            dimensions.tools = "none";
         const report = {
+            name: this._name,
             model: firstMeta?.model,
             systemPromptHash: firstMeta?.systemPrompt
-                ? hashPrompt(firstMeta.systemPrompt)
+                ? hashPrompt(firstMeta.systemPrompt, firstMeta.model)
+                : undefined,
+            promptHash: firstMeta?.systemPrompt
+                ? hashPromptOnly(firstMeta.systemPrompt)
                 : undefined,
+            dimensions,
             tools: firstMeta?.tools,
             successRate,
             failedCases,
@@ -91,14 +126,21 @@ export class AgentContext {
             averageOutputTokensPerCase,
             results,
         };
+        if (report.systemPromptHash && firstMeta?.systemPrompt) {
+            await writeDiffEntry(report.systemPromptHash, firstMeta.systemPrompt, report.tools ?? [], report.model);
+        }
         const formatted = formatReport(report);
         logger.info(formatted);
-        const filepath = await writeReport(formatted, report.timestamp);
+        const filepath = await writeReport(formatted, report.timestamp, report.name, report.dimensions);
         logger.info(`\n${c.dim("Report saved to:")} ${c.cyan(filepath)}`);
         return report;
     }
 }
-function hashPrompt(prompt) {
+function hashPrompt(prompt, model) {
+    const input = model ? `${model}:${prompt}` : prompt;
+    return createHash("sha256").update(input).digest("hex").slice(0, 12);
+}
+export function hashPromptOnly(prompt) {
     return createHash("sha256").update(prompt).digest("hex").slice(0, 12);
 }
 let currentContext = null;

package/dist/index.d.ts CHANGED Viewed

@@ -2,8 +2,16 @@ import type { AgentExecutor, AgentReport } from "./types";
 import { SceneBuilder } from "./context";
 export { expect } from "./assertions";
 export { logger } from "./logger";
+export { defineConfig } from "./config";
+export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
 export type { LogLevel } from "./logger";
 export type { AgentExpectation, AgentMatchers } from "./assertions";
-export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, } from "./types";
+export type { JudgeCriteria } from "./judge";
+export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, JudgeVerdict, JudgeResult, } from "./types";
+export interface AgentOptions {
+    name?: string;
+}
 export declare function scene(prompt: string): SceneBuilder;
-export declare function agent(executor: AgentExecutor, fn: () => void): Promise<AgentReport>;
+/** @internal reset auto-run state between tests */
+export declare function _resetAutoRun(): void;
+export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;

package/dist/index.js CHANGED Viewed

@@ -1,17 +1,40 @@
 import { AgentContext, setContext, getContext } from "./context";
 export { expect } from "./assertions";
 export { logger } from "./logger";
+export { defineConfig } from "./config";
 export function scene(prompt) {
     return getContext().registerScene(prompt);
 }
-export async function agent(executor, fn) {
-    const ctx = new AgentContext(executor);
+const pendingAgents = [];
+let autoRunScheduled = false;
+/** @internal reset auto-run state between tests */
+export function _resetAutoRun() {
+    pendingAgents.length = 0;
+    autoRunScheduled = false;
+}
+export function agent(executor, fn, options) {
+    const ctx = new AgentContext(executor, options?.name);
     setContext(ctx);
     try {
         fn();
     }
-    finally {
+    catch (err) {
         setContext(null);
+        return Promise.reject(err);
+    }
+    setContext(null);
+    const promise = ctx.execute();
+    pendingAgents.push(promise);
+    if (!autoRunScheduled) {
+        autoRunScheduled = true;
+        process.nextTick(async () => {
+            try {
+                await Promise.all(pendingAgents);
+            }
+            catch {
+                process.exitCode = 1;
+            }
+        });
     }
-    return ctx.execute();
+    return promise;
 }

package/dist/judge.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+import type { JudgeResult } from "./types";
+import type { JudgeConfig, JudgeExecutor } from "./config";
+/**
+ * Pick the judge function: `config.executor` when provided, otherwise an
+ * executor that posts to the configured chat completions endpoint.
+ */
+export declare function resolveJudgeExecutor(config: JudgeConfig): JudgeExecutor;
+/** What the judge is asked to evaluate a response against. */
+export interface JudgeCriteria {
+    /** Text placed under the "Success Criteria" heading of the judge prompt. */
+    criteria: string;
+    /** Text placed under the "Failure Conditions" heading of the judge prompt. */
+    failWhen: string;
+    /** Optional text appended under an "Additional Context" heading. */
+    context?: string;
+}
+/**
+ * Ask the judge to evaluate `response` against `criteria`.
+ *
+ * The judge's reply must contain a JSON object whose `verdict` is "pass",
+ * "fail" or "partial". If the reply cannot be parsed the judge is queried
+ * one more time; if that also fails, the first parse error is thrown.
+ * A failure of the executor's first call is thrown as "Judge executor failed: ...".
+ */
+export declare function callJudge(response: string, criteria: JudgeCriteria, executor: JudgeExecutor): Promise<JudgeResult>;

package/dist/judge.js ADDED Viewed

@@ -0,0 +1,101 @@
+/** Judge model used when `config.model` is not set. */
+const DEFAULT_JUDGE_MODEL = "openai/gpt-oss-20b";
+/** API root used when `config.baseUrl` is not set. */
+const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1";
+/**
+ * Build a judge executor that calls an OpenAI-compatible
+ * `POST {baseUrl}/chat/completions` endpoint with a single user message and
+ * `temperature: 0`.
+ *
+ * The API key falls back from `config.apiKey` to the `OPENROUTER_API_KEY`
+ * and then `OPENAI_API_KEY` environment variables (empty string if none).
+ *
+ * @param config Judge settings (`model`, `baseUrl`, `apiKey`).
+ * @returns An executor resolving to the first choice's message content, or
+ *          `""` when that choice carries no content.
+ * @throws Error (from the executor) on a non-2xx status, or when a successful
+ *         response carries no `choices` array, e.g. an error payload
+ *         delivered with status 200.
+ */
+function buildFetchExecutor(config) {
+    const model = config.model ?? DEFAULT_JUDGE_MODEL;
+    // Tolerate a trailing slash so the request URL never contains "//chat".
+    const baseUrl = (config.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
+    const apiKey = config.apiKey ??
+        process.env.OPENROUTER_API_KEY ??
+        process.env.OPENAI_API_KEY ??
+        "";
+    return async (prompt) => {
+        const res = await fetch(`${baseUrl}/chat/completions`, {
+            method: "POST",
+            headers: {
+                "Content-Type": "application/json",
+                Authorization: `Bearer ${apiKey}`,
+            },
+            body: JSON.stringify({
+                model,
+                messages: [{ role: "user", content: prompt }],
+                temperature: 0,
+            }),
+        });
+        if (!res.ok) {
+            const text = await res.text();
+            throw new Error(`Judge API error ${res.status}: ${text.slice(0, 200)}`);
+        }
+        const data = (await res.json());
+        // Without this guard a body lacking `choices` surfaces as an opaque
+        // "Cannot read properties of undefined" TypeError.
+        if (!Array.isArray(data?.choices)) {
+            throw new Error(`Judge API returned no choices: ${JSON.stringify(data ?? null).slice(0, 200)}`);
+        }
+        return data.choices[0]?.message?.content ?? "";
+    };
+}
+/**
+ * Choose the judge function for a run: a user-supplied `config.executor`
+ * takes precedence; otherwise a fetch-based executor is built from the
+ * remaining settings.
+ */
+export function resolveJudgeExecutor(config) {
+    return config.executor ? config.executor : buildFetchExecutor(config);
+}
+/** Fixed instructions placed at the top of every judge prompt. */
+const JUDGE_SYSTEM_PROMPT = `You are an evaluation judge for an AI agent's response. Evaluate the response against the provided criteria.
+Return EXACTLY one JSON object with these fields:
+- "verdict": one of "pass", "fail", or "partial"
+- "reasoning": a brief explanation (1-2 sentences)
+Rules:
+- "pass": The response fully satisfies the success criteria with no issues.
+- "partial": The response partially meets the criteria but has notable gaps or minor issues.
+- "fail": The response meets the failure conditions or fundamentally misses the criteria.
+Respond with ONLY the JSON object, no other text.`;
+/**
+ * Assemble the full judge prompt: the fixed instructions followed by the
+ * agent response, the success criteria and the failure conditions, plus an
+ * "Additional Context" section when `criteria.context` is set.
+ */
+function buildJudgePrompt(response, criteria) {
+    const base = `${JUDGE_SYSTEM_PROMPT}
+## Agent Response
+${response}
+## Success Criteria
+${criteria.criteria}
+## Failure Conditions
+${criteria.failWhen}`;
+    return criteria.context
+        ? `${base}\n\n## Additional Context\n${criteria.context}`
+        : base;
+}
+/**
+ * Turn the judge's raw reply into a result object.
+ *
+ * The span from the first `{` to the last `}` in the reply is parsed as
+ * JSON, so surrounding prose or code fences are tolerated.
+ *
+ * @param raw      Raw text returned by the judge executor.
+ * @param criteria Value stored as-is in the result's `criteria` field.
+ * @returns `{ verdict, reasoning, criteria }`; `reasoning` is stringified and
+ *          defaults to `""`.
+ * @throws Error when no JSON object is present or the verdict is not one of
+ *         "pass", "fail", "partial"; SyntaxError when the JSON is malformed.
+ */
+function parseJudgeResponse(raw, criteria) {
+    const found = raw.match(/\{[\s\S]*\}/);
+    if (found === null) {
+        throw new Error(`Judge returned no JSON object: "${raw.slice(0, 200)}"`);
+    }
+    const { verdict, reasoning } = JSON.parse(found[0]);
+    const allowed = ["pass", "fail", "partial"];
+    if (!allowed.includes(verdict)) {
+        throw new Error(`Judge returned invalid verdict: "${verdict}"`);
+    }
+    return { verdict, reasoning: String(reasoning ?? ""), criteria };
+}
+/**
+ * Ask the judge to evaluate `response` against `criteria`.
+ *
+ * The prompt is sent through `executor`; if the reply cannot be parsed the
+ * same prompt is sent once more. When the second attempt fails as well
+ * (executor error or another unparsable reply) the parse error from the
+ * first attempt is thrown.
+ *
+ * @param response Agent output being judged.
+ * @param criteria Success/failure criteria; only `criteria.criteria` is
+ *                 stored on the returned result.
+ * @param executor Function that sends the prompt to the judge model.
+ * @returns The parsed verdict with its reasoning.
+ * @throws Error "Judge executor failed: ..." when the first executor call
+ *         throws, or the first parse error as described above.
+ */
+export async function callJudge(response, criteria, executor) {
+    const prompt = buildJudgePrompt(response, criteria);
+    let raw;
+    try {
+        raw = await executor(prompt);
+    }
+    catch (err) {
+        // Custom executors may throw non-Error values; keep the message readable.
+        const reason = err instanceof Error ? err.message : String(err);
+        throw new Error(`Judge executor failed: ${reason}`);
+    }
+    try {
+        return parseJudgeResponse(raw, criteria.criteria);
+    }
+    catch (firstErr) {
+        // Retry once on parse failure
+        try {
+            raw = await executor(prompt);
+            return parseJudgeResponse(raw, criteria.criteria);
+        }
+        catch {
+            // Report the original problem, not whatever went wrong on the retry.
+            throw firstErr;
+        }
+    }
+}