@sebastiantuyu/agest 0.1.6 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # Agest
2
2
 
3
+ [![Build Status](https://github.com/sebastiantuyu/agest/actions/workflows/publish.yml/badge.svg)](https://github.com/sebastiantuyu/agest/actions/workflows/publish.yml)
4
+
3
5
  A quantitative testing library for agents using a Jest-like syntax.
4
6
  Batteries included.
5
7
 
@@ -1 +1,3 @@
1
1
  export { langchain } from "./langchain";
2
+ export { remote } from "./remote";
3
+ export type { RemoteAdapterOptions } from "./remote";
@@ -1 +1,2 @@
1
1
  export { langchain } from "./langchain";
2
+ export { remote } from "./remote";
@@ -0,0 +1,58 @@
1
+ import type { AgentExecutor, AgentResponse } from "../types";
2
+ export interface RemoteAdapterOptions {
3
+ /** HTTP headers (e.g. Authorization) */
4
+ headers?: Record<string, string>;
5
+ /** HTTP method, defaults to POST */
6
+ method?: "POST" | "PUT" | "GET";
7
+ /**
8
+ * Build the request body from the input prompt.
9
+ * Defaults to `{ prompt: input }`.
10
+ */
11
+ buildRequest?: (input: string) => unknown;
12
+ /**
13
+ * Parse the raw response body into an AgentResponse.
14
+ * When omitted the adapter tries common shapes:
15
+ * - `{ text }` / `{ response }` / `{ output }` / `{ message }` / `{ content }` / `{ answer }` (top-level, or nested under `data` / `result`) / plain string
16
+ */
17
+ parseResponse?: (body: unknown) => AgentResponse;
18
+ /**
19
+ * Static metadata for this remote agent.
20
+ * Because the remote endpoint is opaque, metadata like model name,
21
+ * tools, and system prompt must be provided manually here.
22
+ */
23
+ metadata?: {
24
+ model?: string;
25
+ tokens?: {
26
+ input: number;
27
+ output: number;
28
+ };
29
+ tools?: string[];
30
+ systemPrompt?: string;
31
+ [key: string]: unknown;
32
+ };
33
+ }
34
+ /**
35
+ * Adapter for remote agents exposed via HTTP endpoints.
36
+ *
37
+ * Since the remote agent is a black box, metadata (model, tools, etc.)
38
+ * must be supplied manually through `options.metadata`. If the endpoint
39
+ * returns token usage or other metadata, provide a `parseResponse`
40
+ * function to extract it.
41
+ *
42
+ * @example
43
+ * ```ts
44
+ * import { remote } from "@sebastiantuyu/agest/adapters";
45
+ *
46
+ * const executor = remote("https://my-agent.example.com/chat", {
47
+ * headers: { Authorization: "Bearer sk-..." },
48
+ * metadata: { model: "gpt-4o", tools: ["search", "calculator"] },
49
+ * });
50
+ *
51
+ * await agent(executor, () => {
52
+ * scene("What is 2+2?").expect("response", (r) => {
53
+ * expect(r).toBe.containing("4");
54
+ * });
55
+ * });
56
+ * ```
57
+ */
58
+ export declare function remote(endpoint: string, options?: RemoteAdapterOptions): AgentExecutor;
@@ -0,0 +1,127 @@
1
/**
 * Adapter for remote agents exposed via HTTP endpoints.
 *
 * Since the remote agent is a black box, metadata (model, tools, etc.)
 * must be supplied manually through `options.metadata`. If the endpoint
 * returns token usage or other metadata, provide a `parseResponse`
 * function to extract it.
 *
 * The returned executor never rejects: network failures, non-2xx statuses,
 * unreadable bodies and a throwing `parseResponse` are all reported as
 * `{ text: "", executionError, metadata }`.
 *
 * Note that with `method: "GET"` no request body is sent, so the prompt is
 * not transmitted to the endpoint.
 *
 * @param endpoint URL the request is sent to.
 * @param options Optional headers, method, request/response hooks and static metadata.
 * @returns An async executor taking the prompt and resolving to an agent response.
 *
 * @example
 * ```ts
 * import { remote } from "@sebastiantuyu/agest/adapters";
 *
 * const executor = remote("https://my-agent.example.com/chat", {
 *   headers: { Authorization: "Bearer sk-..." },
 *   metadata: { model: "gpt-4o", tools: ["search", "calculator"] },
 * });
 *
 * await agent(executor, () => {
 *   scene("What is 2+2?").expect("response", (r) => {
 *     expect(r).toBe.containing("4");
 *   });
 * });
 * ```
 */
export function remote(endpoint, options = {}) {
    const { headers = {}, method = "POST", buildRequest = defaultBuildRequest, parseResponse, metadata: staticMetadata, } = options;
    // Every failure mode shares one shape: empty text, a message, and the
    // caller-supplied metadata so reports can still attribute the run.
    const failure = (executionError) => ({
        text: "",
        executionError,
        metadata: staticMetadata,
    });
    // Thrown values are not guaranteed to be Error instances.
    const describe = (err) => (err instanceof Error ? err.message : String(err));
    return async (input) => {
        let res;
        try {
            const fetchOptions = {
                method,
                headers: { "Content-Type": "application/json", ...headers },
                // GET requests cannot carry a body; only POST/PUT serialize the prompt.
                ...(method !== "GET" ? { body: JSON.stringify(buildRequest(input)) } : {}),
            };
            res = await fetch(endpoint, fetchOptions);
        }
        catch (err) {
            return failure(`Request failed: ${describe(err)}`);
        }
        if (!res.ok) {
            return failure(`HTTP ${res.status}: ${res.statusText}`);
        }
        try {
            const contentType = res.headers.get("content-type") ?? "";
            const body = contentType.includes("application/json")
                ? await res.json()
                : await res.text();
            if (parseResponse) {
                const parsed = parseResponse(body);
                // Metadata extracted from the response wins over the static values.
                return {
                    ...parsed,
                    metadata: { ...staticMetadata, ...parsed.metadata },
                };
            }
            return {
                text: extractText(body),
                metadata: {
                    ...staticMetadata,
                    ...extractResponseMetadata(body),
                },
            };
        }
        catch (err) {
            // Malformed JSON or a throwing parseResponse is reported like any
            // other failure instead of rejecting the executor.
            return failure(`Invalid response: ${describe(err)}`);
        }
    };
}
78
/**
 * Default request-body builder: wraps the prompt as `{ prompt }`.
 *
 * @param input The prompt text for the scene.
 * @returns The object that will be JSON-serialized as the request body.
 */
function defaultBuildRequest(input) {
    const payload = { prompt: input };
    return payload;
}
81
/**
 * Pull the agent's textual answer out of a response body.
 *
 * Strings are returned as-is and other primitives are stringified. For
 * objects, the first string found under a well-known key is returned,
 * looking at the top level first and then inside `data` / `result`.
 * When nothing matches, the whole body is returned as JSON text.
 *
 * @param body Parsed response body (JSON value or plain text).
 * @returns The extracted text.
 */
function extractText(body) {
    if (typeof body === "string") {
        return body;
    }
    if (body === null || typeof body !== "object") {
        return String(body);
    }
    const textKeys = ["text", "response", "output", "message", "content", "answer"];
    // Returns the first string-valued candidate field, or undefined.
    const firstString = (record) => {
        const hit = textKeys.find((key) => typeof record[key] === "string");
        return hit === undefined ? undefined : record[hit];
    };
    const direct = firstString(body);
    if (direct !== undefined) {
        return direct;
    }
    // Try nested: { data: { text } }, { result: { output } }
    for (const wrapper of ["data", "result"]) {
        const nested = body[wrapper];
        if (nested !== null && typeof nested === "object") {
            const inner = firstString(nested);
            if (inner !== undefined) {
                return inner;
            }
        }
    }
    return JSON.stringify(body);
}
103
/**
 * Best-effort extraction of metadata (model, token usage, refusal flag)
 * from a response body.
 *
 * Token usage is looked up under `usage`, `token_usage`, `tokens`,
 * `metadata.usage` and `metadata.tokens`, in that order; the first location
 * holding an object is used. Within it, only finite numbers are accepted as
 * counts, so string or otherwise malformed values cannot leak into the
 * numeric `tokens` fields.
 *
 * @param body Parsed response body.
 * @returns An object with any of `model`, `tokens`, `refusal`, or
 *   `undefined` when the body is not an object or nothing was found.
 */
function extractResponseMetadata(body) {
    if (typeof body !== "object" || body === null) {
        return undefined;
    }
    const isRecord = (value) => typeof value === "object" && value !== null;
    // First finite numeric value among the given keys, or undefined.
    const firstNumber = (record, keys) => {
        for (const key of keys) {
            const value = record[key];
            if (typeof value === "number" && Number.isFinite(value)) {
                return value;
            }
        }
        return undefined;
    };
    const modelPart = typeof body.model === "string" ? { model: body.model } : {};
    // Try to find token usage in common locations
    const nestedMeta = isRecord(body.metadata) ? body.metadata : {};
    const usage = [
        body.usage,
        body.token_usage,
        body.tokens,
        nestedMeta.usage,
        nestedMeta.tokens,
    ].find(isRecord);
    let tokensPart = {};
    if (usage !== undefined) {
        const input = firstNumber(usage, ["input_tokens", "prompt_tokens", "promptTokens", "input"]);
        const output = firstNumber(usage, ["output_tokens", "completion_tokens", "completionTokens", "output"]);
        if (input !== undefined || output !== undefined) {
            // A side that was not reported counts as zero.
            tokensPart = { tokens: { input: input ?? 0, output: output ?? 0 } };
        }
    }
    const refusalPart = typeof body.refusal === "boolean" ? { refusal: body.refusal } : {};
    const meta = { ...modelPart, ...tokensPart, ...refusalPart };
    return Object.keys(meta).length > 0 ? meta : undefined;
}
@@ -1,8 +1,15 @@
1
+ import type { JudgeCriteria } from "./judge";
2
+ export interface PendingJudgement {
3
+ value: unknown;
4
+ criteria: JudgeCriteria;
5
+ }
6
+ export declare function collectPendingJudgements(): PendingJudgement[];
1
7
  export interface AgentMatchers {
2
8
  refusal(): void;
3
9
  notRefusal(): void;
4
10
  containing(text: string): void;
5
11
  matchingPattern(regex: RegExp): void;
12
+ judgedBy(criteria: JudgeCriteria): void;
6
13
  }
7
14
  export interface AgentExpectation {
8
15
  readonly toBe: AgentMatchers;
@@ -1,4 +1,10 @@
1
1
  import { isRefusal } from "./refusal";
2
// Judge assertions queued by `judgedBy`; drained by collectPendingJudgements().
let pendingJudgements = [];
/**
 * Hand over every queued judge assertion and reset the queue.
 *
 * @returns The judgements queued since the previous call (possibly empty).
 */
export function collectPendingJudgements() {
    let drained;
    // Swap the queue for a fresh array so later pushes start a new batch.
    [drained, pendingJudgements] = [pendingJudgements, []];
    return drained;
}
2
8
  export function expect(value) {
3
9
  return {
4
10
  get toBe() {
@@ -31,6 +37,9 @@ export function expect(value) {
31
37
  throw new Error(`Expected response to match ${regex} but got: "${actual.slice(0, 100)}"`);
32
38
  }
33
39
  },
40
+ judgedBy(criteria) {
41
+ pendingJudgements.push({ value, criteria });
42
+ },
34
43
  };
35
44
  },
36
45
  };
@@ -0,0 +1,19 @@
1
+ export type JudgeExecutor = (prompt: string) => Promise<string>;
2
+ export interface JudgeConfig {
3
+ /** Model identifier passed to the OpenAI-compatible API. Defaults to "openai/gpt-oss-20b". */
4
+ model?: string;
5
+ /** API key. Defaults to OPENROUTER_API_KEY then OPENAI_API_KEY env vars. */
6
+ apiKey?: string;
7
+ /** Base URL for the chat completions endpoint. Defaults to "https://openrouter.ai/api/v1". */
8
+ baseUrl?: string;
9
+ /** Fully custom judge function. When provided, model/apiKey/baseUrl are ignored. */
10
+ executor?: JudgeExecutor;
11
+ }
12
+ export interface AgestConfig {
13
+ parallelism?: number;
14
+ timeout?: number;
15
+ turns?: number;
16
+ judge?: JudgeConfig;
17
+ }
18
+ export declare function defineConfig(config: AgestConfig): AgestConfig;
19
+ export declare function loadConfig(): Promise<AgestConfig>;
package/dist/config.js ADDED
@@ -0,0 +1,19 @@
1
+ import path from "path";
2
/**
 * Identity helper for authoring a config file: returns the very object it
 * is given, unchanged.
 *
 * @param config The configuration object.
 * @returns The same `config` reference.
 */
export function defineConfig(config) {
    const passthrough = config;
    return passthrough;
}
5
/**
 * Load the project configuration from the current working directory.
 *
 * Looks for `agest.config.ts` and then `agest.config.js`. The first file
 * that exists and imports successfully wins; its default export is used,
 * falling back to the module namespace when there is no default export.
 *
 * A missing file is skipped silently. A file that exists but fails to load
 * (syntax error, no TypeScript loader, ...) produces a warning and the
 * next candidate is tried, so a broken config is no longer indistinguishable
 * from an absent one.
 *
 * @returns The loaded configuration, or `{}` when none could be loaded.
 */
export async function loadConfig() {
    // Imported locally so the module's top-level import list stays unchanged.
    const { existsSync } = await import("fs");
    const { pathToFileURL } = await import("url");
    for (const fileName of ["agest.config.ts", "agest.config.js"]) {
        const candidate = path.join(process.cwd(), fileName);
        if (!existsSync(candidate)) {
            continue;
        }
        try {
            // Dynamic import() needs a file:// URL for absolute paths; a bare
            // Windows path such as "C:\\..." is rejected as an unsupported scheme.
            const mod = await import(pathToFileURL(candidate).href);
            return (mod.default ?? mod);
        }
        catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            console.warn(`agest: failed to load ${candidate}: ${reason}`);
        }
    }
    return {};
}
package/dist/context.d.ts CHANGED
@@ -2,16 +2,22 @@ import type { AgentExecutor, AgentReport, SceneDefinition } from "./types";
2
2
  export declare class SceneBuilder {
3
3
  private _prompt;
4
4
  private _assertions;
5
+ private _timeout?;
6
+ private _turns?;
5
7
  constructor(_prompt: string);
8
+ timeout(ms: number): SceneBuilder;
9
+ turns(n: number): SceneBuilder;
6
10
  expect(field: string, fn: (value: any) => void): SceneBuilder;
7
11
  toDefinition(): SceneDefinition;
8
12
  }
9
13
  export declare class AgentContext {
10
14
  private _executor;
15
+ private _name?;
11
16
  private _scenes;
12
- constructor(_executor: AgentExecutor);
17
+ constructor(_executor: AgentExecutor, _name?: string | undefined);
13
18
  registerScene(prompt: string): SceneBuilder;
14
19
  execute(): Promise<AgentReport>;
15
20
  }
21
+ export declare function hashPromptOnly(prompt: string): string;
16
22
  export declare function setContext(ctx: AgentContext | null): void;
17
23
  export declare function getContext(): AgentContext;
package/dist/context.js CHANGED
@@ -1,26 +1,40 @@
1
1
  import { createHash } from "crypto";
2
2
  import { executeScene } from "./runner";
3
- import { formatReport, writeReport } from "./reporter";
3
+ import { formatReport, writeReport, writeDiffEntry } from "./reporter";
4
4
  import { logger, c } from "./logger";
5
+ import { loadConfig } from "./config";
6
+ import { PromisePool } from "@supercharge/promise-pool";
5
7
  export class SceneBuilder {
6
8
  _prompt;
7
9
  _assertions = [];
10
+ _timeout;
11
+ _turns;
8
12
  constructor(_prompt) {
9
13
  this._prompt = _prompt;
10
14
  }
15
+ timeout(ms) {
16
+ this._timeout = ms;
17
+ return this;
18
+ }
19
+ turns(n) {
20
+ this._turns = n;
21
+ return this;
22
+ }
11
23
  expect(field, fn) {
12
24
  this._assertions.push({ field, fn });
13
25
  return this;
14
26
  }
15
27
  toDefinition() {
16
- return { prompt: this._prompt, assertions: [...this._assertions] };
28
+ return { prompt: this._prompt, assertions: [...this._assertions], timeout: this._timeout, turns: this._turns };
17
29
  }
18
30
  }
19
31
  export class AgentContext {
20
32
  _executor;
33
+ _name;
21
34
  _scenes = [];
22
- constructor(_executor) {
35
+ constructor(_executor, _name) {
23
36
  this._executor = _executor;
37
+ this._name = _name;
24
38
  }
25
39
  registerScene(prompt) {
26
40
  const builder = new SceneBuilder(prompt);
@@ -28,32 +42,39 @@ export class AgentContext {
28
42
  return builder;
29
43
  }
30
44
  async execute() {
45
+ const config = await loadConfig();
46
+ const parallelism = Math.max(1, config.parallelism ?? 1);
31
47
  const definitions = this._scenes.map((s) => s.toDefinition());
32
- const results = [];
33
- let totalDuration = 0;
48
+ const orderedResults = new Array(definitions.length);
34
49
  const total = definitions.length;
35
- logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}...\n`));
36
- for (let i = 0; i < definitions.length; i++) {
37
- const scene = definitions[i];
50
+ logger.info(c.bold(`\nRunning ${total} scene${total !== 1 ? "s" : ""}${parallelism > 1 ? c.dim(` (parallelism: ${parallelism})`) : ""}...\n`));
51
+ const tasks = definitions.map((scene, i) => async () => {
38
52
  const label = scene.prompt.length > 60
39
53
  ? scene.prompt.slice(0, 57) + "..."
40
54
  : scene.prompt;
41
- logger.write(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... `);
42
- const result = await executeScene(this._executor, scene);
43
- results.push(result);
44
- totalDuration += result.duration;
55
+ const result = await executeScene(this._executor, scene, config.timeout, config.judge, config.turns);
56
+ orderedResults[i] = result;
45
57
  const ms = result.duration.toFixed(0);
46
58
  if (result.passed) {
47
- logger.info(c.green(`PASS`) + c.dim(` (${ms}ms)`));
59
+ logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.green("PASS")}${c.dim(` (${ms}ms)`)}`);
60
+ }
61
+ else if (result.judgement?.verdict === "partial") {
62
+ logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.yellow("PARTIAL")}${c.dim(` (${ms}ms)`)}`);
63
+ if (result.error) {
64
+ logger.info(` ${c.yellow(result.error)}`);
65
+ }
48
66
  }
49
67
  else {
50
- logger.info(c.red(`FAIL`) + c.dim(` (${ms}ms)`));
68
+ logger.info(` ${c.cyan(`[${i + 1}/${total}]`)} ${label} ... ${c.red("FAIL")}${c.dim(` (${ms}ms)`)}`);
51
69
  if (result.error) {
52
70
  logger.info(` ${c.red(result.error)}`);
53
71
  }
54
72
  }
55
73
  logger.debug(` response: ${result.response.text?.slice(0, 120)}`);
56
- }
74
+ });
75
+ await PromisePool.withConcurrency(parallelism).for(tasks).process((task) => task());
76
+ const results = orderedResults;
77
+ let totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
57
78
  logger.info("");
58
79
  const failedResults = results.filter((r) => !r.passed);
59
80
  const failedCases = failedResults.map((r) => r.prompt);
@@ -75,11 +96,25 @@ export class AgentContext {
75
96
  }
76
97
  const firstMeta = results.find((r) => r.response.metadata)?.response
77
98
  .metadata;
99
+ const dimensions = {};
100
+ if (firstMeta?.model)
101
+ dimensions.model = firstMeta.model;
102
+ if (firstMeta?.systemPrompt)
103
+ dimensions.prompt = hashPromptOnly(firstMeta.systemPrompt);
104
+ if (firstMeta?.tools?.length)
105
+ dimensions.tools = [...firstMeta.tools].sort().join(",");
106
+ else
107
+ dimensions.tools = "none";
78
108
  const report = {
109
+ name: this._name,
79
110
  model: firstMeta?.model,
80
111
  systemPromptHash: firstMeta?.systemPrompt
81
- ? hashPrompt(firstMeta.systemPrompt)
112
+ ? hashPrompt(firstMeta.systemPrompt, firstMeta.model)
113
+ : undefined,
114
+ promptHash: firstMeta?.systemPrompt
115
+ ? hashPromptOnly(firstMeta.systemPrompt)
82
116
  : undefined,
117
+ dimensions,
83
118
  tools: firstMeta?.tools,
84
119
  successRate,
85
120
  failedCases,
@@ -91,14 +126,21 @@ export class AgentContext {
91
126
  averageOutputTokensPerCase,
92
127
  results,
93
128
  };
129
+ if (report.systemPromptHash && firstMeta?.systemPrompt) {
130
+ await writeDiffEntry(report.systemPromptHash, firstMeta.systemPrompt, report.tools ?? [], report.model);
131
+ }
94
132
  const formatted = formatReport(report);
95
133
  logger.info(formatted);
96
- const filepath = await writeReport(formatted, report.timestamp);
134
+ const filepath = await writeReport(formatted, report.timestamp, report.name, report.dimensions);
97
135
  logger.info(`\n${c.dim("Report saved to:")} ${c.cyan(filepath)}`);
98
136
  return report;
99
137
  }
100
138
  }
101
- function hashPrompt(prompt) {
139
/**
 * Short fingerprint of a system prompt, optionally scoped to a model.
 *
 * @param prompt The system prompt text.
 * @param model Optional model name; when truthy it is prefixed as `model:prompt`.
 * @returns The first 12 hex characters of the SHA-256 digest.
 */
function hashPrompt(prompt, model) {
    let material = prompt;
    if (model) {
        material = `${model}:${prompt}`;
    }
    const hasher = createHash("sha256");
    hasher.update(material);
    return hasher.digest("hex").substring(0, 12);
}
143
/**
 * Short fingerprint of a prompt on its own, independent of any model.
 *
 * @param prompt The prompt text.
 * @returns The first 12 hex characters of the SHA-256 digest.
 */
export function hashPromptOnly(prompt) {
    const hasher = createHash("sha256");
    hasher.update(prompt);
    const hex = hasher.digest("hex");
    return hex.substring(0, 12);
}
104
146
  let currentContext = null;
package/dist/index.d.ts CHANGED
@@ -2,8 +2,14 @@ import type { AgentExecutor, AgentReport } from "./types";
2
2
  import { SceneBuilder } from "./context";
3
3
  export { expect } from "./assertions";
4
4
  export { logger } from "./logger";
5
+ export { defineConfig } from "./config";
6
+ export type { AgestConfig, JudgeConfig, JudgeExecutor } from "./config";
5
7
  export type { LogLevel } from "./logger";
6
8
  export type { AgentExpectation, AgentMatchers } from "./assertions";
7
- export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, } from "./types";
9
+ export type { JudgeCriteria } from "./judge";
10
+ export type { AgentExecutor, AgentResponse, AgentReport, SceneResult, JudgeVerdict, JudgeResult, } from "./types";
11
+ export interface AgentOptions {
12
+ name?: string;
13
+ }
8
14
  export declare function scene(prompt: string): SceneBuilder;
9
- export declare function agent(executor: AgentExecutor, fn: () => void): Promise<AgentReport>;
15
+ export declare function agent(executor: AgentExecutor, fn: () => void, options?: AgentOptions): Promise<AgentReport>;
package/dist/index.js CHANGED
@@ -1,11 +1,12 @@
1
1
  import { AgentContext, setContext, getContext } from "./context";
2
2
  export { expect } from "./assertions";
3
3
  export { logger } from "./logger";
4
+ export { defineConfig } from "./config";
4
5
  export function scene(prompt) {
5
6
  return getContext().registerScene(prompt);
6
7
  }
7
- export async function agent(executor, fn) {
8
- const ctx = new AgentContext(executor);
8
+ export async function agent(executor, fn, options) {
9
+ const ctx = new AgentContext(executor, options?.name);
9
10
  setContext(ctx);
10
11
  try {
11
12
  fn();
@@ -0,0 +1,9 @@
1
+ import type { JudgeResult } from "./types";
2
+ import type { JudgeConfig, JudgeExecutor } from "./config";
3
+ export declare function resolveJudgeExecutor(config: JudgeConfig): JudgeExecutor;
4
+ export interface JudgeCriteria {
5
+ criteria: string;
6
+ failWhen: string;
7
+ context?: string;
8
+ }
9
+ export declare function callJudge(response: string, criteria: JudgeCriteria, executor: JudgeExecutor): Promise<JudgeResult>;
package/dist/judge.js ADDED
@@ -0,0 +1,101 @@
1
const DEFAULT_JUDGE_MODEL = "openai/gpt-oss-20b";
const DEFAULT_BASE_URL = "https://openrouter.ai/api/v1";
/**
 * Build a judge executor that POSTs the prompt to `<baseUrl>/chat/completions`.
 *
 * Defaults: model `DEFAULT_JUDGE_MODEL`, base URL `DEFAULT_BASE_URL`, and an
 * API key taken from `config.apiKey`, then `OPENROUTER_API_KEY`, then
 * `OPENAI_API_KEY` (empty string when none is set).
 *
 * @param config Judge configuration (`model`, `baseUrl`, `apiKey`).
 * @returns An async function mapping a prompt to the first choice's message
 *   content, or `""` when the payload carries no such content.
 *   It rejects with `Judge API error <status>: ...` on a non-2xx response.
 */
function buildFetchExecutor(config) {
    const model = config.model ?? DEFAULT_JUDGE_MODEL;
    // Drop trailing slashes so "https://host/v1/" does not yield "//chat/completions".
    const baseUrl = (config.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
    const apiKey = config.apiKey ??
        process.env.OPENROUTER_API_KEY ??
        process.env.OPENAI_API_KEY ??
        "";
    return async (prompt) => {
        const res = await fetch(`${baseUrl}/chat/completions`, {
            method: "POST",
            headers: {
                "Content-Type": "application/json",
                Authorization: `Bearer ${apiKey}`,
            },
            body: JSON.stringify({
                model,
                messages: [{ role: "user", content: prompt }],
                temperature: 0,
            }),
        });
        if (!res.ok) {
            const text = await res.text();
            throw new Error(`Judge API error ${res.status}: ${text.slice(0, 200)}`);
        }
        const data = (await res.json());
        // A 2xx payload may lack `choices` (e.g. an error object); treat it
        // as an empty answer instead of crashing with a TypeError.
        return data?.choices?.[0]?.message?.content ?? "";
    };
}
31
/**
 * Pick the judge executor for a config: a custom `executor` wins when
 * present, otherwise a fetch-based executor is built from the config.
 *
 * @param config Judge configuration.
 * @returns The executor used to query the judge.
 */
export function resolveJudgeExecutor(config) {
    const { executor: customExecutor } = config;
    return customExecutor ? customExecutor : buildFetchExecutor(config);
}
36
// Instructions prepended to every judge prompt; asks for a single JSON verdict object.
const JUDGE_SYSTEM_PROMPT = `You are an evaluation judge for an AI agent's response. Evaluate the response against the provided criteria.
Return EXACTLY one JSON object with these fields:
- "verdict": one of "pass", "fail", or "partial"
- "reasoning": a brief explanation (1-2 sentences)

Rules:
- "pass": The response fully satisfies the success criteria with no issues.
- "partial": The response partially meets the criteria but has notable gaps or minor issues.
- "fail": The response meets the failure conditions or fundamentally misses the criteria.

Respond with ONLY the JSON object, no other text.`;
/**
 * Assemble the full judge prompt: the judge instructions followed by the
 * agent response, the success criteria, the failure conditions and, when
 * `criteria.context` is truthy, an additional-context section.
 *
 * @param response The agent's response text under evaluation.
 * @param criteria Object with `criteria`, `failWhen` and optional `context`.
 * @returns The prompt, with sections separated by blank lines.
 */
function buildJudgePrompt(response, criteria) {
    const sections = [
        JUDGE_SYSTEM_PROMPT,
        `## Agent Response\n${response}`,
        `## Success Criteria\n${criteria.criteria}`,
        `## Failure Conditions\n${criteria.failWhen}`,
    ];
    if (criteria.context) {
        sections.push(`## Additional Context\n${criteria.context}`);
    }
    return sections.join("\n\n");
}
63
/**
 * Turn the judge's raw reply into a structured result.
 *
 * The span from the first `{` to the last `}` in the reply is parsed as
 * JSON, which tolerates surrounding chatter.
 *
 * @param raw Raw text returned by the judge.
 * @param criteria Value echoed back on the result's `criteria` field.
 * @returns `{ verdict, reasoning, criteria }`; `reasoning` is always a string.
 * @throws Error when no JSON object is present or the verdict is not one of
 *   "pass", "fail", "partial"; SyntaxError when the JSON is malformed.
 */
function parseJudgeResponse(raw, criteria) {
    const candidate = raw.match(/\{[\s\S]*\}/);
    if (candidate === null) {
        throw new Error(`Judge returned no JSON object: "${raw.slice(0, 200)}"`);
    }
    const { verdict, reasoning } = JSON.parse(candidate[0]);
    const knownVerdicts = ["pass", "fail", "partial"];
    if (!knownVerdicts.includes(verdict)) {
        throw new Error(`Judge returned invalid verdict: "${verdict}"`);
    }
    return {
        verdict,
        reasoning: String(reasoning ?? ""),
        criteria,
    };
}
79
/**
 * Ask the judge to evaluate a response against the given criteria.
 *
 * The executor is called once; if it throws, the call fails immediately with
 * `Judge executor failed: ...`. If its reply cannot be parsed, the executor
 * is asked one more time; when that second attempt also fails (for any
 * reason) the error from the FIRST parse attempt is thrown.
 *
 * The parsed result's `criteria` field carries `criteria.criteria`, i.e. the
 * success-criteria text rather than the whole criteria object.
 *
 * @param response The agent's response text.
 * @param criteria Object with `criteria`, `failWhen` and optional `context`.
 * @param executor Async function mapping a prompt to the judge's raw reply.
 * @returns The parsed judge result.
 */
export async function callJudge(response, criteria, executor) {
    const prompt = buildJudgePrompt(response, criteria);
    let raw;
    try {
        raw = await executor(prompt);
    }
    catch (err) {
        // Executors may throw non-Error values; avoid reporting "undefined".
        const reason = err instanceof Error ? err.message : String(err);
        throw new Error(`Judge executor failed: ${reason}`);
    }
    try {
        return parseJudgeResponse(raw, criteria.criteria);
    }
    catch (firstErr) {
        // Retry once on parse failure
        try {
            raw = await executor(prompt);
            return parseJudgeResponse(raw, criteria.criteria);
        }
        catch {
            // Report the original parse problem, not the retry's.
            throw firstErr;
        }
    }
}
@@ -0,0 +1 @@
1
+ export {};