claude-overnight 1.51.3 → 1.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -572,6 +572,20 @@ A fixed-plan `tasks.json` (without `flexiblePlan: true`) bypasses orchestration
572
572
  | `1` | Some tasks failed |
573
573
  | `2` | All failed or none completed |
574
574
 
575
+ ## Prompt evolution (server-side)
576
+
577
+ The `src/prompt-evolution/` engine and `claude-overnight-evolve` CLI power a self-evolution pipeline that optimises prompts (the planner prompt here, MCP-browser's supervisor prompts, or any prompt in a user's repo) via Pareto-frontier mutation with LLM-as-judge and heuristic scoring.
578
+
579
+ **Multi-hour runs aren't meant for your laptop.** Three ways to run it:
580
+
581
+ 1. **`npx claude-overnight-evolve …`** — quickest. Fine for smoke tests or short runs; needs `ANTHROPIC_API_KEY` (or `ANTHROPIC_AUTH_TOKEN`) in env and keeps running only as long as your shell is open. Output: `~/.claude-overnight/prompt-evolution/<runId>/`.
582
+ 2. **Self-hosted Docker** — [`self-host/`](self-host/README.md) ships a tiny runner image + optional HTTP server (enqueue + read-back) you can run on any VPS. Laptop can be off.
583
+ 3. **Fornace hosted** — already have a fornace project? `POST /api/projects/:id/prompt-evolution/enqueue` runs the same engine in your project's container. Body: `{ prompt, target, evalModel, generations, population, env?, anthropicApiKey?, anthropicBaseUrl?, anthropicModel? }`. Poll `GET /:runId` for status + inline `report.md`. See the [fornace integration doc § 2.6](https://github.com/Fornace/MCP-Browser/blob/main/docs/integration.md#26-prompt-evolution--apiprojectsidprompt-evolution).
584
+
585
+ Experiment credentials for any Anthropic-compatible provider (Anthropic direct, OpenRouter, Kimi, DashScope, a local proxy) are injected via env vars: `ANTHROPIC_BASE_URL`, `ANTHROPIC_API_KEY`, `EVAL_MODEL`, `MUTATE_MODEL`. Self-host reads them from `self-host/.env` (or from the per-run `env:` field in the enqueue body).
586
+
587
+ Full design: [docs/prompt-evolution-research.md](docs/prompt-evolution-research.md).
588
+
575
589
  ## License
576
590
 
577
591
  MIT
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * `claude-overnight-evolve` — CLI for the prompt-evolution engine.
4
+ *
5
+ * Ships with the npm package (compiled to dist/bin/evolve.js). The MCP-browser
6
+ * platform runs this binary inside a per-project `raw`-mode container via
7
+ * `docker exec`. See docs/prompt-evolution-research.md.
8
+ *
9
+ * Examples:
10
+ * claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
11
+ * claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6
12
+ *
13
+ * Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
14
+ * mcp-browser` is used the cwd must be the MCP-browser repo root (so
15
+ * `platform/supervisor/gemini-client.ts` resolves), or pass the file via
16
+ * `MCP_BROWSER_GEMINI_CLIENT`.
17
+ */
18
+ export {};
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * `claude-overnight-evolve` — CLI for the prompt-evolution engine.
4
+ *
5
+ * Ships with the npm package (compiled to dist/bin/evolve.js). The MCP-browser
6
+ * platform runs this binary inside a per-project `raw`-mode container via
7
+ * `docker exec`. See docs/prompt-evolution-research.md.
8
+ *
9
+ * Examples:
10
+ * claude-overnight-evolve --prompt 10_planning/10-3_plan --eval-model claude-haiku-4-5 --generations 3
11
+ * claude-overnight-evolve --target mcp-browser --prompt-kind plan-supervision --eval-model kimi-k2-6
12
+ *
13
+ * Requires ANTHROPIC_API_KEY (or ANTHROPIC_AUTH_TOKEN) in env. When `--target
14
+ * mcp-browser` is used the cwd must be the MCP-browser repo root (so
15
+ * `platform/supervisor/gemini-client.ts` resolves), or pass the file via
16
+ * `MCP_BROWSER_GEMINI_CLIENT`.
17
+ */
18
+ import { evolvePrompt } from "../prompt-evolution/index.js";
19
+ import { PLAN_CASES } from "../prompt-evolution/fixtures/plan-cases.js";
20
+ import { scenariosToCases, PLANNING_SCENARIOS, REVIEW_SCENARIOS, SUPERVISION_SCENARIOS, STUCK_SCENARIOS, hydrateCases, extractPrompt, } from "../prompt-evolution/adapters/mcp-browser.js";
21
/**
 * Print the CLI usage text to stdout, then terminate the process with exit
 * code 0. Invoked by `parseArgs()` when `--help` or `-h` is on the command line.
 *
 * NOTE(review): this listing comes from a rendered diff that appears to have
 * collapsed whitespace inside the template literal — confirm column alignment
 * against the real file before touching the text.
 */
+ function help() {
22
+ process.stdout.write(`Usage: claude-overnight-evolve [options]
23
+
24
+ Options:
25
+ --target <name> claude-overnight | mcp-browser (default: claude-overnight)
26
+ --prompt <path> Prompt file path (claude-overnight)
27
+ --prompt-kind <kind> MCP-browser prompt kind: planning | review | evolution |
28
+ goal-refinement | plan-supervision | simple-supervision | stuck-analysis
29
+ --eval-model <model> Fast model for evaluation (default: claude-haiku-4-5)
30
+ --mutate-model <model> Smarter model for mutation (defaults to eval-model)
31
+ --generations <n> Number of evolution generations (default: 10)
32
+ --population <n> Max population size (default: 8)
33
+ --plateau <n> Stop early if no improvement for N generations (default: 3)
34
+ --cases <suite> Benchmark suite: plan | mcp-planning | mcp-review |
35
+ mcp-supervision | mcp-stuck (default: plan)
36
+ --base-url <url> API base URL override
37
+ --auth-token <token> Auth token override
38
+ --run-id <id> Preset run id (default: auto-generated)
39
+ `);
40
+ process.exit(0);
41
+ }
42
/**
 * Parse `process.argv` into the option bag consumed by `main()`.
 *
 * Defaults come from the environment where available (`EVAL_MODEL`,
 * `MUTATE_MODEL`, `ANTHROPIC_BASE_URL`, `ANTHROPIC_AUTH_TOKEN` falling back to
 * `ANTHROPIC_API_KEY`); command-line flags override them. `--help` / `-h`
 * delegates to `help()`, which prints usage and exits.
 *
 * When `--cases` is not given, the suite label defaults to
 * `mcp-<promptKind|planning>` for the `mcp-browser` target and to `plan`
 * otherwise.
 *
 * @returns {object} Options: `target`, `prompt`, `promptKind`, `evalModel`,
 *   `mutateModel`, `generations`, `population`, `plateau`, `cases`, `baseUrl`,
 *   `authToken`, and `runId` (only when `--run-id` was passed).
 * @throws {Error} If a flag is missing its value, or a numeric flag is not a
 *   non-negative integer.
 */
function parseArgs() {
    const args = process.argv.slice(2);
    if (args.includes("--help") || args.includes("-h"))
        help();
    const opts = {
        target: "claude-overnight",
        prompt: "10_planning/10-3_plan",
        promptKind: "",
        evalModel: process.env.EVAL_MODEL ?? "claude-haiku-4-5",
        mutateModel: process.env.MUTATE_MODEL,
        generations: 10,
        population: 8,
        plateau: 3,
        cases: "",
        baseUrl: process.env.ANTHROPIC_BASE_URL,
        authToken: process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY,
    };
    // Flag → option-key tables replace the former twelve-way switch.
    const stringFlags = new Map([
        ["--target", "target"],
        ["--prompt", "prompt"],
        ["--prompt-kind", "promptKind"],
        ["--eval-model", "evalModel"],
        ["--mutate-model", "mutateModel"],
        ["--cases", "cases"],
        ["--base-url", "baseUrl"],
        ["--auth-token", "authToken"],
        ["--run-id", "runId"],
    ]);
    const intFlags = new Map([
        ["--generations", "generations"],
        ["--population", "population"],
        ["--plateau", "plateau"],
    ]);
    const isFlag = (token) => stringFlags.has(token) || intFlags.has(token);
    for (let i = 0; i < args.length; i++) {
        const flag = args[i];
        if (!isFlag(flag)) {
            // Previously dropped without a trace; still non-fatal, but now visible.
            process.stderr.write(`Warning: ignoring unrecognised argument "${flag}"\n`);
            continue;
        }
        const value = args[i + 1];
        // A trailing flag, or a flag directly followed by another flag, has no value.
        if (value === undefined || isFlag(value)) {
            throw new Error(`Missing value for ${flag}`);
        }
        i++;
        if (stringFlags.has(flag)) {
            opts[stringFlags.get(flag)] = value;
            continue;
        }
        // Reject NaN-producing and partially numeric input ("abc", "3x", "-1").
        if (!/^\d+$/.test(value)) {
            throw new Error(`Invalid value for ${flag}: expected a non-negative integer, got "${value}"`);
        }
        opts[intFlags.get(flag)] = Number.parseInt(value, 10);
    }
    if (opts.target === "mcp-browser" && !opts.cases) {
        opts.cases = `mcp-${opts.promptKind || "planning"}`;
    }
    if (!opts.cases)
        opts.cases = "plan";
    return opts;
}
119
/**
 * CLI entry point: resolve the benchmark cases and seed prompt for the chosen
 * target, print the effective configuration, run `evolvePrompt`, and print the
 * best variant's scores and prompt text.
 *
 * For `--target mcp-browser` the cases are derived from the scenario list of
 * the selected prompt kind and the seed text is read via `extractPrompt`. For
 * any other target only the `plan` suite is available.
 *
 * @throws {Error} On an unknown prompt kind or case suite, or when the
 *   selection yields no benchmark cases.
 */
async function main() {
    const opts = parseArgs();
    let cases;
    let promptPath = opts.prompt;
    let seedText;
    if (opts.target === "mcp-browser") {
        const kind = opts.promptKind || "planning";
        const scenarioMap = {
            planning: PLANNING_SCENARIOS,
            review: REVIEW_SCENARIOS,
            evolution: [],
            "goal-refinement": [],
            "plan-supervision": SUPERVISION_SCENARIOS,
            "simple-supervision": SUPERVISION_SCENARIOS,
            "stuck-analysis": STUCK_SCENARIOS,
        };
        // A mistyped kind used to pass `undefined` scenarios downstream.
        if (!Object.hasOwn(scenarioMap, kind)) {
            throw new Error(`Unknown prompt kind: ${kind} (expected one of: ${Object.keys(scenarioMap).join(", ")})`);
        }
        cases = hydrateCases(scenariosToCases(kind, scenarioMap[kind]));
        promptPath = `mcp-browser/${kind}`;
        seedText = extractPrompt(kind);
    }
    else if (opts.cases === "plan") {
        cases = PLAN_CASES;
    }
    else {
        throw new Error(`Unknown case suite: ${opts.cases}`);
    }
    // Kinds with no scenarios would otherwise start a paid run with nothing to score.
    if (!cases?.length) {
        throw new Error(`No benchmark cases available for ${promptPath}; nothing to evaluate.`);
    }
    console.log(`Evolution config:`);
    console.log(` target: ${opts.target}`);
    console.log(` prompt: ${promptPath}`);
    console.log(` evalModel: ${opts.evalModel}`);
    console.log(` mutateModel: ${opts.mutateModel ?? opts.evalModel}`);
    console.log(` generations: ${opts.generations}`);
    console.log(` population: ${opts.population}`);
    console.log(` plateau: ${opts.plateau}`);
    console.log(` cases: ${cases.length} (${opts.cases})`);
    console.log("");
    const result = await evolvePrompt({
        promptPath,
        cases,
        evalModel: opts.evalModel,
        mutateModel: opts.mutateModel,
        generations: opts.generations,
        populationCap: opts.population,
        plateauGenerations: opts.plateau,
        baseUrl: opts.baseUrl,
        authToken: opts.authToken,
        seedText,
        target: opts.target,
        runId: opts.runId,
        onLog: (text) => console.log(text),
    });
    const { bestVariant } = result;
    // Scores are fractions in the result; show them as one-decimal percentages.
    const pct = (fraction) => `${(fraction * 100).toFixed(1)}%`;
    console.log("\n=== BEST VARIANT ===");
    console.log(`id: ${bestVariant.variantId}`);
    console.log(`generation: ${bestVariant.generation}`);
    console.log(`gmean: ${pct(bestVariant.gmean)}`);
    console.log(`parse: ${pct(bestVariant.aggregate.parse)}`);
    console.log(`schema: ${pct(bestVariant.aggregate.schema)}`);
    console.log(`content: ${pct(bestVariant.aggregate.content)}`);
    console.log(`cost: ${pct(bestVariant.aggregate.costEfficiency)}`);
    console.log(`speed: ${pct(bestVariant.aggregate.speed)}`);
    console.log("\n--- Prompt text ---");
    console.log(bestVariant.text);
}
// Any failure (bad arguments, missing prompt file, API error) is printed and
// reported through a non-zero exit status.
main().catch((err) => {
    console.error(err);
    process.exit(1);
});
@@ -1 +1 @@
1
- export declare const VERSION = "1.51.3";
1
+ export declare const VERSION = "1.53.0";
@@ -1,2 +1,2 @@
1
1
  // Auto-generated by build — do not edit manually.
2
- export const VERSION = "1.51.3";
2
+ export const VERSION = "1.53.0";
@@ -12,10 +12,28 @@
12
12
  */
13
13
  import { readFileSync } from "node:fs";
14
14
  import { resolve } from "node:path";
15
- const GEMINI_CLIENT_PATH = resolve("/Users/francesco/works/repos/MCP-browser/platform/supervisor/gemini-client.ts");
15
/**
 * Locate `gemini-client.ts` in the MCP-browser checkout that is in scope.
 *
 * Lookup order:
 *   1. `MCP_BROWSER_GEMINI_CLIENT` — explicit path to the file itself.
 *   2. `MCP_BROWSER_REPO` — repository root; the relative file path is appended.
 *   3. The current working directory, treated as the repository root.
 *
 * No absolute host path is baked in, so the same code works wherever the
 * checkout happens to live.
 *
 * @returns {string} Absolute path to `gemini-client.ts`.
 */
function resolveGeminiClientPath() {
    const { MCP_BROWSER_GEMINI_CLIENT: explicitFile, MCP_BROWSER_REPO: repoRoot } = process.env;
    return explicitFile
        ? resolve(explicitFile)
        : resolve(repoRoot ?? process.cwd(), "platform/supervisor/gemini-client.ts");
}
16
33
  /** Extract a const prompt string from gemini-client.ts by name */
17
34
  export function extractPrompt(kind) {
18
- const source = readFileSync(GEMINI_CLIENT_PATH, "utf-8");
35
+ const path = resolveGeminiClientPath();
36
+ const source = readFileSync(path, "utf-8");
19
37
  const nameMap = {
20
38
  planning: "PLANNING_PROMPT",
21
39
  review: "REVIEW_PROMPT",
@@ -29,7 +47,7 @@ export function extractPrompt(kind) {
29
47
  const pattern = new RegExp(`const ${constName} = \`([\\s\\S]*?)\`;`);
30
48
  const m = source.match(pattern);
31
49
  if (!m)
32
- throw new Error(`Prompt ${constName} not found in ${GEMINI_CLIENT_PATH}`);
50
+ throw new Error(`Prompt ${constName} not found in ${path}`);
33
51
  return m[1].trim();
34
52
  }
35
53
  /** Build a synthetic user prompt for a given kind and scenario */
@@ -76,12 +76,15 @@ async function runSingle(job, opts) {
76
76
  const baseUrl = (opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com").replace(/\/$/, "");
77
77
  const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
78
78
  const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
79
+ const isKimi = /kimi\.com/i.test(baseUrl);
79
80
  let body;
80
81
  let endpoint;
81
82
  let headers = {
82
83
  "Content-Type": "application/json",
83
84
  "Authorization": `Bearer ${authToken}`,
84
85
  };
86
+ if (isKimi)
87
+ headers["User-Agent"] = "Kilo-Code/1.0";
85
88
  if (isAnthropic) {
86
89
  // Anthropic native format
87
90
  endpoint = `${baseUrl}/v1/messages`;
@@ -17,7 +17,7 @@
17
17
  * 4. Mutate worst-performing variants using failure traces
18
18
  * 5. Repeat
19
19
  */
20
- import type { BenchmarkCase, VariantRow, LearningEntry } from "./types.js";
20
+ import type { BenchmarkCase, EvolutionResult } from "./types.js";
21
21
  export interface EvolveOpts {
22
22
  /** Prompt file path, e.g. "10_planning/10-3_plan" or "mcp-browser/planning" */
23
23
  promptPath: string;
@@ -31,6 +31,8 @@ export interface EvolveOpts {
31
31
  generations?: number;
32
32
  /** Population size cap */
33
33
  populationCap?: number;
34
+ /** Stop early if no improvement for N generations (default: 3) */
35
+ plateauGenerations?: number;
34
36
  /** Current canon gmean (0 if none) */
35
37
  canonGmean?: number;
36
38
  /** Optional logging callback */
@@ -41,10 +43,9 @@ export interface EvolveOpts {
41
43
  authToken?: string;
42
44
  /** Optional seed prompt text (for non-file prompts like MCP-browser) */
43
45
  seedText?: string;
44
- }
45
- export interface EvolutionResult {
46
- bestVariant: VariantRow;
47
- allRows: VariantRow[];
48
- learningLog: LearningEntry[];
46
+ /** Target project label for persistence */
47
+ target?: string;
48
+ /** Run ID override (auto-generated if omitted) */
49
+ runId?: string;
49
50
  }
50
51
  export declare function evolvePrompt(opts: EvolveOpts): Promise<EvolutionResult>;
@@ -21,12 +21,31 @@ import { renderPrompt } from "../prompts/load.js";
21
21
  import { buildMatrix, renderVariant } from "./evaluator.js";
22
22
  import { mutate } from "./mutator.js";
23
23
  import { curate, formatMatrix } from "./curator.js";
24
+ import { initRun, appendMatrix, appendLearning, snapshotPrompts, finalizeRun } from "./persistence.js";
25
+ import { generateReport } from "./report.js";
24
26
  export async function evolvePrompt(opts) {
25
27
  const log = opts.onLog ?? ((t) => process.stdout.write(t + "\n"));
26
- const generations = opts.generations ?? 3;
27
- const populationCap = opts.populationCap ?? 6;
28
+ const generations = opts.generations ?? 10;
29
+ const populationCap = opts.populationCap ?? 8;
30
+ const plateauGenerations = opts.plateauGenerations ?? 3;
28
31
  const mutateModel = opts.mutateModel ?? opts.evalModel;
29
32
  const canonGmean = opts.canonGmean ?? 0;
33
+ const target = opts.target ?? "claude-overnight";
34
+ const runId = opts.runId ?? `run_${Date.now().toString(36)}_${Math.random().toString(36).slice(2, 6)}`;
35
+ // ── 0. Initialise persistence ──
36
+ initRun({
37
+ runId,
38
+ promptPath: opts.promptPath,
39
+ target,
40
+ evalModel: opts.evalModel,
41
+ mutateModel,
42
+ generations,
43
+ populationCap,
44
+ startedAt: new Date().toISOString(),
45
+ status: "running",
46
+ caseNames: opts.cases.map((c) => c.name),
47
+ });
48
+ log(`Run directory: ~/.claude-overnight/prompt-evolution/${runId}/`);
30
49
  // ── 1. Seed population from existing variants or seed text ──
31
50
  let population = opts.seedText
32
51
  ? [{ id: "default", promptPath: opts.promptPath, generation: 0, text: opts.seedText }]
@@ -34,6 +53,8 @@ export async function evolvePrompt(opts) {
34
53
  log(`Seeded ${population.length} variants from ${opts.promptPath}`);
35
54
  const learningLog = [];
36
55
  let bestOverall = null;
56
+ const generationMatrices = [];
57
+ let generationsWithoutImprovement = 0;
37
58
  for (let gen = 0; gen < generations; gen++) {
38
59
  log(`\n=== Generation ${gen + 1}/${generations} | Population: ${population.length} ===`);
39
60
  // ── 2. Evaluate ──
@@ -47,11 +68,18 @@ export async function evolvePrompt(opts) {
47
68
  },
48
69
  };
49
70
  const matrix = await buildMatrix(population, opts.cases, evalOpts);
71
+ generationMatrices.push(matrix);
72
+ snapshotPrompts(runId, matrix);
73
+ appendMatrix(runId, gen, matrix);
50
74
  log(formatMatrix(matrix, opts.cases.map((c) => c.name)));
51
75
  // Track best
52
76
  const genBest = matrix.reduce((a, b) => (a.gmean > b.gmean ? a : b));
53
- if (!bestOverall || genBest.gmean > bestOverall.gmean) {
77
+ if (!bestOverall || genBest.gmean > bestOverall.gmean + 0.001) {
54
78
  bestOverall = genBest;
79
+ generationsWithoutImprovement = 0;
80
+ }
81
+ else {
82
+ generationsWithoutImprovement++;
55
83
  }
56
84
  // ── 3. Curate ──
57
85
  const curateOpts = {
@@ -71,6 +99,12 @@ export async function evolvePrompt(opts) {
71
99
  }));
72
100
  // ── 5. Mutate to refill ──
73
101
  const targetSize = Math.min(populationCap, keptRows.length + 2);
102
+ const newEntries = [];
103
+ // Early stopping check
104
+ if (generationsWithoutImprovement >= plateauGenerations && gen >= 2) {
105
+ log(`\n=== Early stop: no improvement for ${generationsWithoutImprovement} generations ===`);
106
+ break;
107
+ }
74
108
  if (nextPop.length < targetSize && gen < generations - 1) {
75
109
  const mutantsNeeded = targetSize - nextPop.length;
76
110
  log(`Generating ${mutantsNeeded} mutant(s)...`);
@@ -115,12 +149,14 @@ export async function evolvePrompt(opts) {
115
149
  generation: mutant.generation,
116
150
  text: mutant.text,
117
151
  });
118
- learningLog.push({
152
+ const entry = {
119
153
  generation: gen,
120
154
  mutationSummary: mutant.mutationSummary,
121
155
  fitnessDelta: 0, // filled next gen
122
156
  status: "neutral",
123
- });
157
+ };
158
+ learningLog.push(entry);
159
+ newEntries.push(entry);
124
160
  log(` Mutant ${mutant.variantId} ← ${parent.variantId}: ${mutant.mutationSummary}`);
125
161
  }
126
162
  catch (err) {
@@ -129,6 +165,8 @@ export async function evolvePrompt(opts) {
129
165
  }
130
166
  }
131
167
  }
168
+ if (newEntries.length)
169
+ appendLearning(runId, newEntries);
132
170
  population = nextPop;
133
171
  }
134
172
  // Final evaluation of surviving population
@@ -139,13 +177,36 @@ export async function evolvePrompt(opts) {
139
177
  authToken: opts.authToken,
140
178
  concurrency: 4,
141
179
  });
180
+ generationMatrices.push(finalMatrix);
181
+ snapshotPrompts(runId, finalMatrix);
182
+ appendMatrix(runId, generations, finalMatrix);
142
183
  log(formatMatrix(finalMatrix, opts.cases.map((c) => c.name)));
143
184
  const best = finalMatrix.reduce((a, b) => (a.gmean > b.gmean ? a : b));
144
- if (bestOverall && bestOverall.gmean > best.gmean) {
145
- // Return the historical best even if it didn't survive final cull
146
- return { bestVariant: bestOverall, allRows: finalMatrix, learningLog };
147
- }
148
- return { bestVariant: best, allRows: finalMatrix, learningLog };
185
+ const historicalBest = bestOverall && bestOverall.gmean > best.gmean ? bestOverall : best;
186
+ const result = {
187
+ bestVariant: historicalBest,
188
+ allRows: finalMatrix,
189
+ learningLog,
190
+ runId,
191
+ };
192
+ // Generate and save report
193
+ const baselineText = generationMatrices[0]?.find((r) => r.variantId === "default")?.text;
194
+ const report = generateReport({
195
+ runId,
196
+ promptPath: opts.promptPath,
197
+ target,
198
+ evalModel: opts.evalModel,
199
+ generations,
200
+ baselineText,
201
+ }, result, generationMatrices);
202
+ const { writeFileSync } = await import("node:fs");
203
+ const { runDir } = await import("./persistence.js");
204
+ const reportPath = `${runDir(runId)}/report.md`;
205
+ writeFileSync(reportPath, report);
206
+ result.reportPath = reportPath;
207
+ finalizeRun(runId, result);
208
+ log(`\nReport saved: ${reportPath}`);
209
+ return result;
149
210
  }
150
211
  // ── Helpers ──
151
212
  function seedPopulation(promptPath) {
@@ -0,0 +1,40 @@
1
+ /**
2
+ * LLM-as-judge scoring for prompt evolution.
3
+ *
4
+ * Inspired by Hermes Agent's autoresearch skill and self-evolution repo:
5
+ * - Structured rubric (5 criteria × 1-5 scale, normalised to 0-1)
6
+ * - The judge sees the prompt, the case, and the model's raw output
7
+ * - Returns both a scalar score and human-readable justification
8
+ *
9
+ * When to use:
10
+ * - Content criteria that are too fuzzy for deterministic regex
11
+ * (e.g. "was the plan creative?", "did the response follow the spirit of the prompt?")
12
+ * - Final validation gate before promoting a variant to canon
13
+ *
14
+ * Cost: ~1 judge call per case per generation (~$0.002-0.01 each).
15
+ */
16
+ import type { BenchmarkCase, ScoreDimensions } from "./types.js";
17
/** Connection and model settings for a single `judgeOutput` call. */
+ export interface JudgeOpts {
18
/** Model identifier sent as `model` in the request body. */
+ model: string;
19
/** API base URL; falls back to `ANTHROPIC_BASE_URL`, then `https://api.anthropic.com`. */
+ baseUrl?: string;
20
/** Bearer token; falls back to `ANTHROPIC_AUTH_TOKEN`, then `ANTHROPIC_API_KEY`. */
+ authToken?: string;
21
/** Completion token limit sent as `max_tokens` (default: 2048). */
+ maxTokens?: number;
22
/** Request timeout in milliseconds (default: 60000). */
+ timeoutMs?: number;
23
+ }
24
+ export interface JudgeResult {
25
+ /** 0-1 overall quality score */
26
+ score: number;
27
+ /** Dimension breakdown matching ScoreDimensions keys */
28
+ dimensions: Partial<ScoreDimensions>;
29
+ /** Human-readable rubric justification */
30
+ justification: string;
31
+ }
32
+ /**
33
+ * Score a single (case, output) pair with an LLM judge.
34
+ *
35
+ * The judge prompt is carefully structured to be reproducible:
36
+ * - Exact rubric with 1-5 Likert scale definitions
37
+ * - One-shot example in the prompt text
38
+ * - Forced JSON output schema
39
+ */
40
+ export declare function judgeOutput(rawOutput: string, c: BenchmarkCase, opts: JudgeOpts): Promise<JudgeResult>;
@@ -0,0 +1,158 @@
1
+ /**
2
+ * LLM-as-judge scoring for prompt evolution.
3
+ *
4
+ * Inspired by Hermes Agent's autoresearch skill and self-evolution repo:
5
+ * - Structured rubric (5 criteria × 1-5 scale, normalised to 0-1)
6
+ * - The judge sees the prompt, the case, and the model's raw output
7
+ * - Returns both a scalar score and human-readable justification
8
+ *
9
+ * When to use:
10
+ * - Content criteria that are too fuzzy for deterministic regex
11
+ * (e.g. "was the plan creative?", "did the response follow the spirit of the prompt?")
12
+ * - Final validation gate before promoting a variant to canon
13
+ *
14
+ * Cost: ~1 judge call per case per generation (~$0.002-0.01 each).
15
+ */
16
/**
 * The five rubric criteria the judge scores, in the order they are listed in
 * the judge prompt. Each entry pairs the JSON key the judge must return
 * (`name`) with the question it answers.
 */
const DEFAULT_RUBRIC = [
    ["parse", "Is the output well-formed and parseable (valid JSON if expected, clear structure otherwise)?"],
    ["schema", "Does the output contain all required fields / follow the expected schema?"],
    ["content", "Is the content accurate, relevant, and satisfying the user's intent?"],
    ["concision", "Is the response concise without omitting necessary detail?"],
    ["instruction", "Does the output follow the explicit and implicit instructions in the system prompt?"],
].map(([name, question]) => ({ name, question }));
23
/**
 * Ask an LLM judge to score one (case, output) pair.
 *
 * Builds the judge prompt, POSTs it to the configured endpoint and converts
 * the reply into a score via `parseJudgeOutput`. Base URLs on `anthropic.com`
 * use the native `/v1/messages` route and response shape; every other base URL
 * uses `/v1/chat/completions` and the `choices[0].message.content` shape.
 *
 * @param rawOutput Raw text produced by the model under evaluation.
 * @param c Benchmark case the output is judged against.
 * @param opts Judge settings: `model`, optional `baseUrl`, `authToken`,
 *   `maxTokens` (default 2048) and `timeoutMs` (default 60000).
 * @returns The parsed judge result (score, dimensions, justification).
 * @throws {Error} `Judge HTTP <status>: …` when the response is not OK; fetch,
 *   timeout and JSON-decoding errors propagate unchanged.
 */
export async function judgeOutput(rawOutput, c, opts) {
    const configuredBase = opts.baseUrl ?? process.env.ANTHROPIC_BASE_URL ?? "https://api.anthropic.com";
    const baseUrl = configuredBase.replace(/\/$/, "");
    const authToken = opts.authToken ?? process.env.ANTHROPIC_AUTH_TOKEN ?? process.env.ANTHROPIC_API_KEY ?? "";
    const isAnthropic = /^https?:\/\/(api\.)?anthropic\.com/i.test(baseUrl);
    const isKimi = /kimi\.com/i.test(baseUrl);
    // NOTE(review): credentials always go out as a Bearer token, including to the
    // native Anthropic endpoint — confirm this is accepted for plain API keys.
    const headers = {
        "Content-Type": "application/json",
        "Authorization": `Bearer ${authToken}`,
    };
    if (isAnthropic) {
        headers["anthropic-version"] = "2023-06-01";
    }
    if (isKimi) {
        headers["User-Agent"] = "Kilo-Code/1.0";
    }
    const route = isAnthropic ? "/v1/messages" : "/v1/chat/completions";
    const response = await fetch(`${baseUrl}${route}`, {
        method: "POST",
        headers,
        body: JSON.stringify({
            model: opts.model,
            max_tokens: opts.maxTokens ?? 2048,
            messages: [{ role: "user", content: buildJudgePrompt(rawOutput, c) }],
        }),
        signal: AbortSignal.timeout(opts.timeoutMs ?? 60_000),
    });
    if (!response.ok) {
        // Best effort: an unreadable error body must not mask the status code.
        const detail = await response.text().catch(() => "");
        throw new Error(`Judge HTTP ${response.status}: ${detail.slice(0, 200)}`);
    }
    const payload = await response.json();
    const raw = isAnthropic
        ? (payload.content?.map((block) => block.text ?? "").join("") ?? "")
        : (payload.choices?.[0]?.message?.content ?? "");
    return parseJudgeOutput(raw);
}
72
/**
 * Assemble the prompt text sent to the judge model.
 *
 * The prompt contains the case's `name` and `promptPath`, one line per entry
 * of `c.criteria`, the first 3000 characters of `rawOutput` inside a fenced
 * block, the 1-5 scale definitions, the numbered `DEFAULT_RUBRIC` questions,
 * and an example of the JSON object the judge must return.
 *
 * NOTE(review): criteria values are interpolated with `${v}`, so this assumes
 * they stringify meaningfully — confirm against the BenchmarkCase type.
 */
+ function buildJudgePrompt(rawOutput, c) {
73
+ const rubricLines = DEFAULT_RUBRIC
74
+ .map((r, i) => `${i + 1}. **${r.name}**: ${r.question}`)
75
+ .join("\n");
76
+ return `You are an expert prompt-evaluation judge. Your task is to score a language-model output against a benchmark case using a strict rubric.
77
+
78
+ ## Benchmark Case
79
+
80
+ - **Name**: ${c.name}
81
+ - **Prompt path**: ${c.promptPath}
82
+ - **Expected criteria**:
83
+ ${Object.entries(c.criteria).map(([k, v]) => ` - ${k}: ${v}`).join("\n")}
84
+
85
+ ## Model Output
86
+
87
+ \`\`\`
88
+ ${rawOutput.slice(0, 3000)}
89
+ \`\`\`
90
+
91
+ ## Rubric
92
+
93
+ Score each criterion on a 1-5 Likert scale:
94
+ - 5 = Excellent / exceeds expectations
95
+ - 4 = Good / meets expectations
96
+ - 3 = Acceptable / minor issues
97
+ - 2 = Poor / significant issues
98
+ - 1 = Unacceptable / fails completely
99
+
100
+ ${rubricLines}
101
+
102
+ ## Response Format
103
+
104
+ Respond ONLY with a JSON object in this exact shape (no markdown fences, no extra text):
105
+
106
+ {"parse":5,"schema":5,"content":4,"concision":4,"instruction":5,"justification":"Brief justification here."}
107
+ `;
108
+ }
109
/**
 * Convert the judge model's raw reply into a normalised result.
 *
 * Surrounding markdown fences are stripped, then the text is parsed as JSON;
 * if that fails, the span from the first `{` to the last `}` is tried instead.
 * Each rubric score is clamped to 1-5 and mapped to 0-1 via `(n - 1) / 4`. A
 * missing or non-numeric score counts as the neutral 3. The overall `score` is
 * the mean of the five normalised values.
 *
 * @param raw Text returned by the judge model.
 * @returns `{ score, dimensions, justification }`. When no JSON object can be
 *   recovered (unparseable text, `null`, arrays, bare primitives, non-string
 *   input) the result is the neutral fallback: score 0.5 and empty dimensions.
 */
function parseJudgeOutput(raw) {
    const neutralFallback = () => ({
        score: 0.5,
        dimensions: {},
        justification: "Judge returned unparseable JSON. Falling back to neutral.",
    });
    const tryParse = (candidate) => {
        try {
            return JSON.parse(candidate);
        }
        catch {
            return undefined;
        }
    };
    // Guard against a non-string payload (e.g. structured content) reaching here.
    const text = typeof raw === "string" ? raw : "";
    // Strip fences
    const cleaned = text
        .replace(/^```(?:json)?\s*\n?/i, "")
        .replace(/\n?```\s*$/i, "")
        .trim();
    let obj = tryParse(cleaned);
    if (obj === undefined) {
        // Try to extract first JSON object
        const m = cleaned.match(/\{[\s\S]*\}/);
        obj = m ? tryParse(m[0]) : undefined;
    }
    // `JSON.parse("null")` succeeds with null, which used to crash on `obj[k]`;
    // arrays and bare primitives cannot carry rubric keys either.
    if (obj === null || typeof obj !== "object" || Array.isArray(obj)) {
        return neutralFallback();
    }
    const getNum = (k) => {
        const v = obj[k];
        // Judges sometimes quote their scores ("4"); accept those as numbers.
        const n = typeof v === "string" && v.trim() !== "" ? Number(v) : v;
        if (typeof n === "number" && !Number.isNaN(n))
            return Math.max(1, Math.min(5, n));
        return 3; // neutral default
    };
    const parse = (getNum("parse") - 1) / 4;
    const schema = (getNum("schema") - 1) / 4;
    const content = (getNum("content") - 1) / 4;
    const concision = (getNum("concision") - 1) / 4;
    const instruction = (getNum("instruction") - 1) / 4;
    // Map concision → costEfficiency, instruction → speed (proxy for "follows instructions quickly")
    const dimensions = {
        parse,
        schema,
        content,
        costEfficiency: concision,
        speed: instruction,
    };
    const vals = [parse, schema, content, concision, instruction];
    const score = vals.reduce((a, b) => a + b, 0) / vals.length;
    return {
        score,
        dimensions,
        justification: typeof obj.justification === "string" ? obj.justification : "(no justification)",
    };
}